diff --git a/main.py b/main.py
index be2508e8d72fece740b9e0ed0e637bfdcbb942d4..2cb05ffdeea7a33dd23827c3b5289e8b424d3ee5 100755
--- a/main.py
+++ b/main.py
@@ -31,6 +31,8 @@ def create_arg_parser():
                       help='list all supported fileformats')
     info.add_argument('-s', '--show', action='store_true',
                       help='list all the harmful metadata of a file without removing them')
+    info.add_argument('-L', '--lightweight', action='store_true',
+                      help='remove SOME metadata')
 
     return parser
 
@@ -50,7 +52,7 @@ def show_meta(filename:str):
             print(" %s: harmful content" % k)
 
 
-def clean_meta(filename:str):
+def clean_meta(filename:str, is_lightweigth:bool):
     if not __check_file(filename, os.R_OK|os.W_OK):
         return
 
@@ -58,7 +60,10 @@ def clean_meta(filename:str):
     if p is None:
         print("[-] %s's format (%s) is not supported" % (filename, mtype))
         return
-    p.remove_all()
+    if is_lightweigth:
+        p.remove_all_lightweight()
+    else:
+        p.remove_all()
 
 
 def show_parsers():
@@ -78,12 +83,12 @@ def __get_files_recursively(files):
                 for _f in _files:
                     yield os.path.join(path, _f)
 
-def __do_clean_async(q):
+def __do_clean_async(is_lightweigth, q):
     while True:
         f = q.get()
         if f is None:  # nothing more to process
             return
-        clean_meta(f)
+        clean_meta(f, is_lightweigth)
         q.task_done()
 
 
@@ -109,7 +114,7 @@ def main():
             q.put(f)
 
         for _ in range(multiprocessing.cpu_count()):
-            worker = Thread(target=__do_clean_async, args=(q, ))
+            worker = Thread(target=__do_clean_async, args=(mode, q))
             worker.start()
             threads.append(worker)
 
diff --git a/src/abstract.py b/src/abstract.py
index 04c1535f69235b52217050626bb83536e940c561..93e842187630f189062eaff4ef55261109ad663b 100644
--- a/src/abstract.py
+++ b/src/abstract.py
@@ -16,3 +16,7 @@ class AbstractParser(abc.ABC):
     @abc.abstractmethod
     def remove_all(self) -> bool:
         pass
+
+    def remove_all_lightweight(self) -> bool:
+        """ Remove _SOME_ metadata. """
+        return self.remove_all()
diff --git a/src/pdf.py b/src/pdf.py
index c11944964ad942e6b8960faf10cd396b68c20954..6e639cde2387d7a3f315985ca090630a0b1619e8 100644
--- a/src/pdf.py
+++ b/src/pdf.py
@@ -29,18 +29,43 @@ class PDFParser(abstract.AbstractParser):
         self.uri = 'file://' + os.path.abspath(self.filename)
         self.__scale = 2  # how much precision do we want for the render
 
+    def remove_all_lightweight(self):
+        """
+            Load the document into Poppler, render pages on a new PDFSurface.
+        """
+        document = Poppler.Document.new_from_file(self.uri, None)
+        pages_count = document.get_n_pages()
+
+        tmp_path = tempfile.mkstemp()[1]
+        pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)
+        pdf_context = cairo.Context(pdf_surface)  # context draws on the surface
+
+        for pagenum in range(pages_count):
+            logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
+            page = document.get_page(pagenum)
+            page_width, page_height = page.get_size()
+            pdf_surface.set_size(page_width, page_height)
+            pdf_context.save()
+            page.render_for_printing(pdf_context)
+            pdf_context.restore()
+            pdf_context.show_page()  # draw pdf_context on pdf_surface
+        pdf_surface.finish()
+
+        self.__remove_superficial_meta(tmp_path, self.output_filename)
+        os.remove(tmp_path)
+
+        return True
+
     def remove_all(self):
         """
             Load the document into Poppler, render pages on PNG,
-            and shove those PNG into a new PDF. Metadata from the new
-            PDF are removed via Poppler, because there is no way to tell
-            cairo to not add "created by cairo" during rendering.
+            and shove those PNG into a new PDF.
         """
         document = Poppler.Document.new_from_file(self.uri, None)
         pages_count = document.get_n_pages()
 
         _, tmp_path = tempfile.mkstemp()
-        pdf_surface = cairo.PDFSurface(tmp_path, 128, 128)
+        pdf_surface = cairo.PDFSurface(tmp_path, 32, 32)  # resized later anyway
         pdf_context = cairo.Context(pdf_surface)
 
         for pagenum in range(pages_count):
@@ -69,14 +94,18 @@ class PDFParser(abstract.AbstractParser):
         pdf_surface.finish()
 
         # Removes metadata added by Poppler
-        document = Poppler.Document.new_from_file('file://' + tmp_path)
-        document.set_producer('')
-        document.set_creator('')
-        document.save('file://' + os.path.abspath(self.output_filename))
+        self.__remove_superficial_meta(tmp_path, self.output_filename)
         os.remove(tmp_path)
 
         return True
 
+    def __remove_superficial_meta(self, in_file:str, out_file: str) -> bool:
+        document = Poppler.Document.new_from_file('file://' + in_file)
+        document.set_producer('')
+        document.set_creator('')
+        document.save('file://' + os.path.abspath(out_file))
+        return True
+
     def __parse_metadata_field(self, data:str) -> dict:
         metadata = {}
 
diff --git a/tests/test_climat2.py b/tests/test_climat2.py
index b9c52b5d6c6e2f3ab10fc45832fc89d0a590ee30..64345eb7883167f725ad6857f573465c7bfe8088 100644
--- a/tests/test_climat2.py
+++ b/tests/test_climat2.py
@@ -6,12 +6,12 @@ class TestHelp(unittest.TestCase):
     def test_help(self):
         proc = subprocess.Popen(['./main.py', '--help'], stdout=subprocess.PIPE)
         stdout, _ = proc.communicate()
-        self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [files [files ...]]', stdout)
+        self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout)
 
     def test_no_arg(self):
         proc = subprocess.Popen(['./main.py'], stdout=subprocess.PIPE)
         stdout, _ = proc.communicate()
-        self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [files [files ...]]', stdout)
+        self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout)
 
 
 class TestGetMeta(unittest.TestCase):
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 6141dbe8606426616326ab1c9dd8b2be75050eef..34f7301845fd326721e7951e300ead8c3677a040 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -138,6 +138,37 @@ class TestDeepCleaning(unittest.TestCase):
 
         os.remove('./tests/data/clean.odt')
 
+class TestLightWeightCleaning(unittest.TestCase):
+    def test_pdf(self):
+        shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
+        p = pdf.PDFParser('./tests/data/clean.pdf')
+
+        meta = p.get_meta()
+        self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
+
+        ret = p.remove_all_lightweight()
+        self.assertTrue(ret)
+
+        p = pdf.PDFParser('./tests/data/clean.pdf.cleaned')
+        expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
+        self.assertEqual(p.get_meta(), expected_meta)
+
+        os.remove('./tests/data/clean.pdf')
+
+    def test_png(self):
+        shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
+        p = images.PNGParser('./tests/data/clean.png')
+
+        meta = p.get_meta()
+        self.assertEqual(meta['Comment'], 'This is a comment, be careful!')
+
+        ret = p.remove_all_lightweight()
+        self.assertTrue(ret)
+
+        p = images.PNGParser('./tests/data/clean.png.cleaned')
+        self.assertEqual(p.get_meta(), {})
+
+        os.remove('./tests/data/clean.png')
 
 class TestCleaning(unittest.TestCase):
     def test_pdf(self):