From acb9b2d14e3173029aa44e853662d11efbe4fa5e Mon Sep 17 00:00:00 2001
From: jvoisin <julien.voisin@dustri.org>
Date: Sun, 18 Mar 2018 23:48:14 +0100
Subject: [PATCH] Clean metadata

---
 src/parsers/pdf.py    | 37 ++++++++++++++++---------------------
 tests/test_libmat2.py |  5 ++---
 2 files changed, 18 insertions(+), 24 deletions(-)

diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py
index a77eabd..26985c6 100644
--- a/src/parsers/pdf.py
+++ b/src/parsers/pdf.py
@@ -7,17 +7,13 @@ import logging
 import tempfile
 import shutil
 import io
+import tempfile
 
 import cairo
 import gi
 gi.require_version('Poppler', '0.18')
 from gi.repository import Poppler
 
-try:
-    from PIL import Image
-except ImportError:
-    Image = None
-
 from . import abstract
 
 logging.basicConfig(level=logging.DEBUG)
@@ -26,8 +22,9 @@ logging.basicConfig(level=logging.DEBUG)
 class PDFParser(abstract.AbstractParser):
     def __init__(self, filename):
         super().__init__(filename)
-        self.meta_list = {'title', 'author', 'subject',
-            'keywords', 'creator', 'producer', 'metadata'}
+        self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
+                'metadata', 'mod-date', 'producer', 'subject', 'title',
+                'viewer-preferences'}
         self.uri = 'file://' + os.path.abspath(self.filename)
         self.password = None
 
@@ -37,25 +34,24 @@ class PDFParser(abstract.AbstractParser):
             and shove those PNG into a new PDF. Metadata from the new
             PDF are removed via Poppler, because there is no way to tell
             cairo to not add "created by cairo" during rendering.
-
-            TODO: Improve the resolution
-            TODO: Don't use a temp file
         """
         document = Poppler.Document.new_from_file(self.uri, self.password)
+        pages_count = document.get_n_pages()
 
-        pdf_surface = cairo.PDFSurface(self.output_filename, 128, 128)
+        _, tmp_path = tempfile.mkstemp()
+        pdf_surface = cairo.PDFSurface(tmp_path, 128, 128)
         pdf_context = cairo.Context(pdf_surface)
 
-        for pagenum in range(document.get_n_pages()):
+        for pagenum in range(pages_count):
             page = document.get_page(pagenum)
             page_width, page_height = page.get_size()
-            logging.info("Rendering page %d/%d", pagenum + 1, document.get_n_pages())
+            logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
 
             img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2)
             img_context = cairo.Context(img_surface)
 
             img_context.scale(2, 2)
-            page.render_for_printing_with_options(img_context, Poppler.PrintFlags.DOCUMENT)
+            page.render_for_printing(img_context)
             img_context.show_page()
 
             buf = io.BytesIO()
@@ -63,8 +59,6 @@ class PDFParser(abstract.AbstractParser):
             img_surface.finish()
             buf.seek(0)
 
-            #buf = self.__optimize_image_size(buf)
-
             img = cairo.ImageSurface.create_from_png(buf)
             pdf_surface.set_size(page_width*2, page_height*2)
             pdf_context.set_source_surface(img, 0, 0)
@@ -73,11 +67,12 @@ class PDFParser(abstract.AbstractParser):
 
         pdf_surface.finish()
 
-        # This is removing metadata
-        #document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password)
-        #document.set_producer('totally not MAT2 ;)')
-        #document.set_creator('')
-        #document.save('file://' + os.path.abspath("OUT_clean.pdf"))
+        # This is removing metadata added by cairo
+        document = Poppler.Document.new_from_file('file://' + tmp_path)
+        document.set_producer('')
+        document.set_creator('')
+        document.save('file://' + os.path.abspath(self.output_filename))
+        os.remove(tmp_path)
 
         return True
 
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 4751aa4..4b36270 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -31,6 +31,5 @@ class TestCleaning(unittest.TestCase):
         self.assertTrue(ret)
 
         p = pdf.PDFParser('./tests/data/clean.pdf.cleaned')
-        remaining_meta = {'creator': 'cairo 1.14.10 (http://cairographics.org)',
-                'producer': 'cairo 1.14.10 (http://cairographics.org)'}
-        self.assertEqual(p.get_meta(), remaining_meta)
+        expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
+        self.assertEqual(p.get_meta(), expected_meta)
-- 
GitLab