diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py index a77eabde9b962854d2d202960c7bc7d4948f0693..26985c6d4d8ca1ae97a665181197bb1759f390a4 100644 --- a/src/parsers/pdf.py +++ b/src/parsers/pdf.py @@ -7,17 +7,13 @@ import logging import tempfile import shutil import io +import tempfile import cairo import gi gi.require_version('Poppler', '0.18') from gi.repository import Poppler -try: - from PIL import Image -except ImportError: - Image = None - from . import abstract logging.basicConfig(level=logging.DEBUG) @@ -26,8 +22,9 @@ logging.basicConfig(level=logging.DEBUG) class PDFParser(abstract.AbstractParser): def __init__(self, filename): super().__init__(filename) - self.meta_list = {'title', 'author', 'subject', - 'keywords', 'creator', 'producer', 'metadata'} + self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords', + 'metadata', 'mod-date', 'producer', 'subject', 'title', + 'viewer-preferences'} self.uri = 'file://' + os.path.abspath(self.filename) self.password = None @@ -37,25 +34,24 @@ class PDFParser(abstract.AbstractParser): and shove those PNG into a new PDF. Metadata from the new PDF are removed via Poppler, because there is no way to tell cairo to not add "created by cairo" during rendering. 
- - TODO: Improve the resolution - TODO: Don't use a temp file """ document = Poppler.Document.new_from_file(self.uri, self.password) + pages_count = document.get_n_pages() - pdf_surface = cairo.PDFSurface(self.output_filename, 128, 128) + _, tmp_path = tempfile.mkstemp() + pdf_surface = cairo.PDFSurface(tmp_path, 128, 128) pdf_context = cairo.Context(pdf_surface) - for pagenum in range(document.get_n_pages()): + for pagenum in range(pages_count): page = document.get_page(pagenum) page_width, page_height = page.get_size() - logging.info("Rendering page %d/%d", pagenum + 1, document.get_n_pages()) + logging.info("Rendering page %d/%d", pagenum + 1, pages_count) img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2) img_context = cairo.Context(img_surface) img_context.scale(2, 2) - page.render_for_printing_with_options(img_context, Poppler.PrintFlags.DOCUMENT) + page.render_for_printing(img_context) img_context.show_page() buf = io.BytesIO() @@ -63,8 +59,6 @@ class PDFParser(abstract.AbstractParser): img_surface.finish() buf.seek(0) - #buf = self.__optimize_image_size(buf) - img = cairo.ImageSurface.create_from_png(buf) pdf_surface.set_size(page_width*2, page_height*2) pdf_context.set_source_surface(img, 0, 0) @@ -73,11 +67,12 @@ class PDFParser(abstract.AbstractParser): pdf_surface.finish() - # This is removing metadata - #document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password) - #document.set_producer('totally not MAT2 ;)') - #document.set_creator('') - #document.save('file://' + os.path.abspath("OUT_clean.pdf")) + # This is removing metadata added by cairo + document = Poppler.Document.new_from_file('file://' + tmp_path) + document.set_producer('') + document.set_creator('') + document.save('file://' + os.path.abspath(self.output_filename)) + os.remove(tmp_path) return True diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 
4751aa4eb86332dc3ccb58c14f48a329c152b0ac..4b36270381001efe85c2ea27b1aad760712360fc 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -31,6 +31,5 @@ class TestCleaning(unittest.TestCase): self.assertTrue(ret) p = pdf.PDFParser('./tests/data/clean.pdf.cleaned') - remaining_meta = {'creator': 'cairo 1.14.10 (http://cairographics.org)', - 'producer': 'cairo 1.14.10 (http://cairographics.org)'} - self.assertEqual(p.get_meta(), remaining_meta) + expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1} + self.assertEqual(p.get_meta(), expected_meta)