Skip to content
Snippets Groups Projects
Commit acb9b2d1 authored by Julien (jvoisin) Voisin
Browse files

Clean metadata

parent df3c27d7
No related branches found
No related tags found
No related merge requests found
...@@ -7,17 +7,13 @@ import logging ...@@ -7,17 +7,13 @@ import logging
import tempfile import tempfile
import shutil import shutil
import io import io
import tempfile
import cairo import cairo
import gi import gi
gi.require_version('Poppler', '0.18') gi.require_version('Poppler', '0.18')
from gi.repository import Poppler from gi.repository import Poppler
try:
from PIL import Image
except ImportError:
Image = None
from . import abstract from . import abstract
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
...@@ -26,8 +22,9 @@ logging.basicConfig(level=logging.DEBUG) ...@@ -26,8 +22,9 @@ logging.basicConfig(level=logging.DEBUG)
class PDFParser(abstract.AbstractParser): class PDFParser(abstract.AbstractParser):
def __init__(self, filename): def __init__(self, filename):
super().__init__(filename) super().__init__(filename)
self.meta_list = {'title', 'author', 'subject', self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
'keywords', 'creator', 'producer', 'metadata'} 'metadata', 'mod-date', 'producer', 'subject', 'title',
'viewer-preferences'}
self.uri = 'file://' + os.path.abspath(self.filename) self.uri = 'file://' + os.path.abspath(self.filename)
self.password = None self.password = None
...@@ -37,25 +34,24 @@ class PDFParser(abstract.AbstractParser): ...@@ -37,25 +34,24 @@ class PDFParser(abstract.AbstractParser):
and shove those PNG into a new PDF. Metadata from the new and shove those PNG into a new PDF. Metadata from the new
PDF are removed via Poppler, because there is no way to tell PDF are removed via Poppler, because there is no way to tell
cairo to not add "created by cairo" during rendering. cairo to not add "created by cairo" during rendering.
TODO: Improve the resolution
TODO: Don't use a temp file
""" """
document = Poppler.Document.new_from_file(self.uri, self.password) document = Poppler.Document.new_from_file(self.uri, self.password)
pages_count = document.get_n_pages()
pdf_surface = cairo.PDFSurface(self.output_filename, 128, 128) _, tmp_path = tempfile.mkstemp()
pdf_surface = cairo.PDFSurface(tmp_path, 128, 128)
pdf_context = cairo.Context(pdf_surface) pdf_context = cairo.Context(pdf_surface)
for pagenum in range(document.get_n_pages()): for pagenum in range(pages_count):
page = document.get_page(pagenum) page = document.get_page(pagenum)
page_width, page_height = page.get_size() page_width, page_height = page.get_size()
logging.info("Rendering page %d/%d", pagenum + 1, document.get_n_pages()) logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2) img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, int(page_width)*2, int(page_height)*2)
img_context = cairo.Context(img_surface) img_context = cairo.Context(img_surface)
img_context.scale(2, 2) img_context.scale(2, 2)
page.render_for_printing_with_options(img_context, Poppler.PrintFlags.DOCUMENT) page.render_for_printing(img_context)
img_context.show_page() img_context.show_page()
buf = io.BytesIO() buf = io.BytesIO()
...@@ -63,8 +59,6 @@ class PDFParser(abstract.AbstractParser): ...@@ -63,8 +59,6 @@ class PDFParser(abstract.AbstractParser):
img_surface.finish() img_surface.finish()
buf.seek(0) buf.seek(0)
#buf = self.__optimize_image_size(buf)
img = cairo.ImageSurface.create_from_png(buf) img = cairo.ImageSurface.create_from_png(buf)
pdf_surface.set_size(page_width*2, page_height*2) pdf_surface.set_size(page_width*2, page_height*2)
pdf_context.set_source_surface(img, 0, 0) pdf_context.set_source_surface(img, 0, 0)
...@@ -73,11 +67,12 @@ class PDFParser(abstract.AbstractParser): ...@@ -73,11 +67,12 @@ class PDFParser(abstract.AbstractParser):
pdf_surface.finish() pdf_surface.finish()
# This is removing metadata # This is removing metadata added by Poppler
#document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password) document = Poppler.Document.new_from_file('file://' + tmp_path)
#document.set_producer('totally not MAT2 ;)') document.set_producer('')
#document.set_creator('') document.set_creator('')
#document.save('file://' + os.path.abspath("OUT_clean.pdf")) document.save('file://' + os.path.abspath(self.output_filename))
os.remove(tmp_path)
return True return True
......
...@@ -31,6 +31,5 @@ class TestCleaning(unittest.TestCase): ...@@ -31,6 +31,5 @@ class TestCleaning(unittest.TestCase):
self.assertTrue(ret) self.assertTrue(ret)
p = pdf.PDFParser('./tests/data/clean.pdf.cleaned') p = pdf.PDFParser('./tests/data/clean.pdf.cleaned')
remaining_meta = {'creator': 'cairo 1.14.10 (http://cairographics.org)', expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
'producer': 'cairo 1.14.10 (http://cairographics.org)'} self.assertEqual(p.get_meta(), expected_meta)
self.assertEqual(p.get_meta(), remaining_meta)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment