Commit 96299c6a authored by jvoisin

Add lightweight processing for PDF

parent 6f4ed249
......@@ -31,6 +31,8 @@ def create_arg_parser():
help='list all supported fileformats')
info.add_argument('-s', '--show', action='store_true',
help='list all the harmful metadata of a file without removing them')
info.add_argument('-L', '--lightweight', action='store_true',
help='remove SOME metadata')
return parser
......@@ -50,7 +52,7 @@ def show_meta(filename:str):
print(" %s: harmful content" % k)
def clean_meta(filename:str):
def clean_meta(filename:str, is_lightweigth:bool):
if not __check_file(filename, os.R_OK|os.W_OK):
return
......@@ -58,7 +60,10 @@ def clean_meta(filename:str):
if p is None:
print("[-] %s's format (%s) is not supported" % (filename, mtype))
return
p.remove_all()
if is_lightweigth:
p.remove_all_lightweight()
else:
p.remove_all()
def show_parsers():
......@@ -78,12 +83,12 @@ def __get_files_recursively(files):
for _f in _files:
yield os.path.join(path, _f)
def __do_clean_async(is_lightweigth, q):
    """Worker loop: clean every file taken from `q` until a `None` item arrives.

    :param is_lightweigth: forwarded to `clean_meta`; when true the
        lightweight cleaning method is used instead of the full one.
    :param q: queue of file paths to clean; a `None` item means there is
        nothing more to process and makes the worker return.
    """
    while True:
        f = q.get()
        if f is None:  # nothing more to process
            return
        # `clean_meta` is declared as (filename, is_lightweigth): the path
        # goes first so the flag is never mistaken for the file name.
        clean_meta(f, is_lightweigth)
        q.task_done()
......@@ -109,7 +114,7 @@ def main():
q.put(f)
for _ in range(multiprocessing.cpu_count()):
worker = Thread(target=__do_clean_async, args=(q, ))
worker = Thread(target=__do_clean_async, args=(mode, q))
worker.start()
threads.append(worker)
......
......@@ -16,3 +16,7 @@ class AbstractParser(abc.ABC):
@abc.abstractmethod
def remove_all(self) -> bool:
    """Clean the file; every concrete parser has to provide its own version."""
def remove_all_lightweight(self) -> bool:
    """ Remove _SOME_ metadata.

    This default simply delegates to `remove_all` and hands back its result.
    """
    outcome = self.remove_all()
    return outcome
......@@ -29,18 +29,43 @@ class PDFParser(abstract.AbstractParser):
self.uri = 'file://' + os.path.abspath(self.filename)
self.__scale = 2 # how much precision do we want for the render
def remove_all_lightweight(self) -> bool:
    """
    Load the document into Poppler, render pages on a new PDFSurface.

    The pages are drawn into a temporary PDF, which is then passed to
    `__remove_superficial_meta` to produce `self.output_filename`.
    The temporary file is deleted even if rendering fails.

    :return: True once the output document has been written.
    """
    document = Poppler.Document.new_from_file(self.uri, None)
    pages_count = document.get_n_pages()

    # mkstemp() returns an open descriptor along with the path. Only the
    # path is used here, so close the descriptor instead of leaking it.
    tmp_fd, tmp_path = tempfile.mkstemp()
    os.close(tmp_fd)

    try:
        # Initial size is a placeholder: it is reset for every page below.
        pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)
        pdf_context = cairo.Context(pdf_surface)  # context draws on the surface

        for pagenum in range(pages_count):
            logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
            page = document.get_page(pagenum)
            page_width, page_height = page.get_size()
            pdf_surface.set_size(page_width, page_height)
            pdf_context.save()
            page.render_for_printing(pdf_context)
            pdf_context.restore()
            pdf_context.show_page()  # draw pdf_context on pdf_surface
        pdf_surface.finish()

        self.__remove_superficial_meta(tmp_path, self.output_filename)
    finally:
        os.remove(tmp_path)

    return True
def remove_all(self):
"""
Load the document into Poppler, render pages on PNG,
and shove those PNG into a new PDF. Metadata from the new
PDF are removed via Poppler, because there is no way to tell
cairo to not add "created by cairo" during rendering.
and shove those PNG into a new PDF.
"""
document = Poppler.Document.new_from_file(self.uri, None)
pages_count = document.get_n_pages()
_, tmp_path = tempfile.mkstemp()
pdf_surface = cairo.PDFSurface(tmp_path, 128, 128)
pdf_surface = cairo.PDFSurface(tmp_path, 32, 32) # resized later anyway
pdf_context = cairo.Context(pdf_surface)
for pagenum in range(pages_count):
......@@ -69,14 +94,18 @@ class PDFParser(abstract.AbstractParser):
pdf_surface.finish()
# Removes metadata added by Poppler
document = Poppler.Document.new_from_file('file://' + tmp_path)
document.set_producer('')
document.set_creator('')
document.save('file://' + os.path.abspath(self.output_filename))
self.__remove_superficial_meta(tmp_path, self.output_filename)
os.remove(tmp_path)
return True
def __remove_superficial_meta(self, in_file:str, out_file: str) -> bool:
    """Re-save `in_file` as `out_file` with empty producer and creator fields.

    :param in_file: path of the PDF to load.
    :param out_file: path the resulting PDF is saved to.
    :return: always True.
    """
    # A file:// URI needs an absolute path, so normalise both ends; this
    # way a relative `in_file` is handled just like a relative `out_file`.
    document = Poppler.Document.new_from_file('file://' + os.path.abspath(in_file))
    document.set_producer('')
    document.set_creator('')
    document.save('file://' + os.path.abspath(out_file))
    return True
def __parse_metadata_field(self, data:str) -> dict:
metadata = {}
......
......@@ -6,12 +6,12 @@ class TestHelp(unittest.TestCase):
def test_help(self):
    """`--help` prints the usage line, including the `-L` switch."""
    proc = subprocess.Popen(['./main.py', '--help'], stdout=subprocess.PIPE)
    stdout, _ = proc.communicate()
    # Only the usage string that lists -L is checked: the older string
    # without it can no longer appear once the option exists.
    self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout)
def test_no_arg(self):
    """Running without arguments prints the usage line, including `-L`."""
    proc = subprocess.Popen(['./main.py'], stdout=subprocess.PIPE)
    stdout, _ = proc.communicate()
    # Only the usage string that lists -L is checked: the older string
    # without it can no longer appear once the option exists.
    self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout)
class TestGetMeta(unittest.TestCase):
......
......@@ -138,6 +138,37 @@ class TestDeepCleaning(unittest.TestCase):
os.remove('./tests/data/clean.odt')
class TestLightWeightCleaning(unittest.TestCase):
    """Exercise `remove_all_lightweight` on a PDF and on a PNG fixture."""

    @staticmethod
    def _remove_if_present(path):
        """Delete `path`, ignoring the case where it was never created."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass

    def _copy_fixture(self, source, target):
        """Copy `source` to `target` and register removal of the test files.

        Both the working copy and its `.cleaned` counterpart are scheduled
        for deletion through `addCleanup`, so they go away even when an
        assertion fails halfway through the test.
        """
        shutil.copy(source, target)
        for path in (target, target + '.cleaned'):
            self.addCleanup(self._remove_if_present, path)

    def test_pdf(self):
        self._copy_fixture('./tests/data/dirty.pdf', './tests/data/clean.pdf')
        p = pdf.PDFParser('./tests/data/clean.pdf')

        # Make sure the fixture really carries metadata before cleaning it.
        meta = p.get_meta()
        self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')

        ret = p.remove_all_lightweight()
        self.assertTrue(ret)

        p = pdf.PDFParser('./tests/data/clean.pdf.cleaned')
        expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
        self.assertEqual(p.get_meta(), expected_meta)

    def test_png(self):
        self._copy_fixture('./tests/data/dirty.png', './tests/data/clean.png')
        p = images.PNGParser('./tests/data/clean.png')

        # Make sure the fixture really carries metadata before cleaning it.
        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'This is a comment, be careful!')

        ret = p.remove_all_lightweight()
        self.assertTrue(ret)

        p = images.PNGParser('./tests/data/clean.png.cleaned')
        self.assertEqual(p.get_meta(), {})
class TestCleaning(unittest.TestCase):
def test_pdf(self):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment