Skip to content
Snippets Groups Projects
Commit acb9b2d1 authored by Julien (jvoisin) Voisin
Browse files

Clean metadata

parent df3c27d7
No related branches found
No related tags found
No related merge requests found
......@@ -7,17 +7,13 @@ import logging
import tempfile
import shutil
import io
import tempfile
import cairo
import gi
gi.require_version('Poppler', '0.18')
from gi.repository import Poppler
try:
from PIL import Image
except ImportError:
Image = None
from . import abstract
logging.basicConfig(level=logging.DEBUG)
......@@ -26,8 +22,9 @@ logging.basicConfig(level=logging.DEBUG)
class PDFParser(abstract.AbstractParser):
def __init__(self, filename):
super().__init__(filename)
self.meta_list = {'title', 'author', 'subject',
'keywords', 'creator', 'producer', 'metadata'}
self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
'metadata', 'mod-date', 'producer', 'subject', 'title',
'viewer-preferences'}
self.uri = 'file://' + os.path.abspath(self.filename)
self.password = None
......@@ -37,25 +34,24 @@ class PDFParser(abstract.AbstractParser):
and shove those PNG into a new PDF. Metadata from the new
PDF are removed via Poppler, because there is no way to tell
cairo to not add "created by cairo" during rendering.
TODO: Improve the resolution
TODO: Don't use a temp file
"""
document = Poppler.Document.new_from_file(self.uri, self.password)
pages_count = document.get_n_pages()
pdf_surface = cairo.PDFSurface(self.output_filename, 128, 128)
_, tmp_path = tempfile.mkstemp()
pdf_surface = cairo.PDFSurface(tmp_path, 128, 128)
pdf_context = cairo.Context(pdf_surface)
for pagenum in range(document.get_n_pages()):
for pagenum in range(pages_count):
page = document.get_page(pagenum)
page_width, page_height = page.get_size()
logging.info("Rendering page %d/%d", pagenum + 1, document.get_n_pages())
logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2)
img_context = cairo.Context(img_surface)
img_context.scale(2, 2)
page.render_for_printing_with_options(img_context, Poppler.PrintFlags.DOCUMENT)
page.render_for_printing(img_context)
img_context.show_page()
buf = io.BytesIO()
......@@ -63,8 +59,6 @@ class PDFParser(abstract.AbstractParser):
img_surface.finish()
buf.seek(0)
#buf = self.__optimize_image_size(buf)
img = cairo.ImageSurface.create_from_png(buf)
pdf_surface.set_size(page_width*2, page_height*2)
pdf_context.set_source_surface(img, 0, 0)
......@@ -73,11 +67,12 @@ class PDFParser(abstract.AbstractParser):
pdf_surface.finish()
# This is removing metadata
#document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password)
#document.set_producer('totally not MAT2 ;)')
#document.set_creator('')
#document.save('file://' + os.path.abspath("OUT_clean.pdf"))
# This removes the metadata added by cairo (via Poppler)
document = Poppler.Document.new_from_file('file://' + tmp_path)
document.set_producer('')
document.set_creator('')
document.save('file://' + os.path.abspath(self.output_filename))
os.remove(tmp_path)
return True
......
......@@ -31,6 +31,5 @@ class TestCleaning(unittest.TestCase):
self.assertTrue(ret)
p = pdf.PDFParser('./tests/data/clean.pdf.cleaned')
remaining_meta = {'creator': 'cairo 1.14.10 (http://cairographics.org)',
'producer': 'cairo 1.14.10 (http://cairographics.org)'}
self.assertEqual(p.get_meta(), remaining_meta)
expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
self.assertEqual(p.get_meta(), expected_meta)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment