From acb9b2d14e3173029aa44e853662d11efbe4fa5e Mon Sep 17 00:00:00 2001 From: jvoisin <julien.voisin@dustri.org> Date: Sun, 18 Mar 2018 23:48:14 +0100 Subject: [PATCH] Clean metadata --- src/parsers/pdf.py | 37 ++++++++++++++++--------------------- tests/test_libmat2.py | 5 ++--- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py index a77eabd..26985c6 100644 --- a/src/parsers/pdf.py +++ b/src/parsers/pdf.py @@ -7,17 +7,13 @@ import logging import tempfile import shutil import io +import tempfile import cairo import gi gi.require_version('Poppler', '0.18') from gi.repository import Poppler -try: - from PIL import Image -except ImportError: - Image = None - from . import abstract logging.basicConfig(level=logging.DEBUG) @@ -26,8 +22,9 @@ logging.basicConfig(level=logging.DEBUG) class PDFParser(abstract.AbstractParser): def __init__(self, filename): super().__init__(filename) - self.meta_list = {'title', 'author', 'subject', - 'keywords', 'creator', 'producer', 'metadata'} + self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords', + 'metadata', 'mod-date', 'producer', 'subject', 'title', + 'viewer-preferences'} self.uri = 'file://' + os.path.abspath(self.filename) self.password = None @@ -37,25 +34,24 @@ class PDFParser(abstract.AbstractParser): and shove those PNG into a new PDF. Metadata from the new PDF are removed via Poppler, because there is no way to tell cairo to not add "created by cairo" during rendering. 
- - TODO: Improve the resolution - TODO: Don't use a temp file """ document = Poppler.Document.new_from_file(self.uri, self.password) + pages_count = document.get_n_pages() - pdf_surface = cairo.PDFSurface(self.output_filename, 128, 128) + _, tmp_path = tempfile.mkstemp() + pdf_surface = cairo.PDFSurface(tmp_path, 128, 128) pdf_context = cairo.Context(pdf_surface) - for pagenum in range(document.get_n_pages()): + for pagenum in range(pages_count): page = document.get_page(pagenum) page_width, page_height = page.get_size() - logging.info("Rendering page %d/%d", pagenum + 1, document.get_n_pages()) + logging.info("Rendering page %d/%d", pagenum + 1, pages_count) img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2) img_context = cairo.Context(img_surface) img_context.scale(2, 2) - page.render_for_printing_with_options(img_context, Poppler.PrintFlags.DOCUMENT) + page.render_for_printing(img_context) img_context.show_page() buf = io.BytesIO() @@ -63,8 +59,6 @@ class PDFParser(abstract.AbstractParser): img_surface.finish() buf.seek(0) - #buf = self.__optimize_image_size(buf) - img = cairo.ImageSurface.create_from_png(buf) pdf_surface.set_size(page_width*2, page_height*2) pdf_context.set_source_surface(img, 0, 0) @@ -73,11 +67,12 @@ class PDFParser(abstract.AbstractParser): pdf_surface.finish() - # This is removing metadata - #document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password) - #document.set_producer('totally not MAT2 ;)') - #document.set_creator('') - #document.save('file://' + os.path.abspath("OUT_clean.pdf")) + # This is removing metadata added by cairo + document = Poppler.Document.new_from_file('file://' + tmp_path) + document.set_producer('') + document.set_creator('') + document.save('file://' + os.path.abspath(self.output_filename)) + os.remove(tmp_path) return True diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 4751aa4..4b36270 100644 --- 
a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -31,6 +31,5 @@ class TestCleaning(unittest.TestCase): self.assertTrue(ret) p = pdf.PDFParser('./tests/data/clean.pdf.cleaned') - remaining_meta = {'creator': 'cairo 1.14.10 (http://cairographics.org)', - 'producer': 'cairo 1.14.10 (http://cairographics.org)'} - self.assertEqual(p.get_meta(), remaining_meta) + expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1} + self.assertEqual(p.get_meta(), expected_meta) -- GitLab