Skip to content
Snippets Groups Projects
Commit df3c27d7 authored by Julien (jvoisin) Voisin
Browse files

Improve the testsuite

parent 06976537
No related branches found
No related tags found
No related merge requests found
__version__ = '2.0'
class AbstractParser(object):
    """Base class for file-format parsers.

    Subclasses fill in ``meta_list`` with the metadata keys they handle and
    implement ``get_meta`` and ``remove_all``.
    """

    def __init__(self, filename: str):
        """Record the file to operate on.

        :param filename: path of the file to inspect or clean.
        """
        self.filename = filename
        # Path of the cleaned copy; kept next to the input so that the
        # original file is never overwritten.
        self.output_filename = filename + '.cleaned'
        self.meta_list = set()

    def get_meta(self):
        """Return the metadata of the file; must be overridden."""
        raise NotImplementedError

    def remove_all(self):
        """Remove the metadata of the file; must be overridden."""
        raise NotImplementedError
""" Handle PDF
"""
import os
import logging
import tempfile
import shutil
import io
import cairo
import gi
gi.require_version('Poppler', '0.18')
from gi.repository import Poppler, Gio, GLib
try:
from PIL import Image
except ImportError:
Image = None
from . import abstract
logging.basicConfig(level=logging.DEBUG)
class PDFParser(abstract.AbstractParser):
    """Read and strip the metadata of a PDF file using Poppler and cairo."""

    def __init__(self, filename):
        """Prepare a parser for the PDF at ``filename``.

        :param filename: path of the PDF to inspect or clean.
        """
        super().__init__(filename)
        self.meta_list = {'title', 'author', 'subject',
                          'keywords', 'creator', 'producer', 'metadata'}
        self.uri = self._path_to_uri(self.filename)
        self.password = None
        # Default the destination when the base class did not provide one,
        # so the cleaned file never lands at a fixed name in the working
        # directory.
        if not hasattr(self, 'output_filename'):
            self.output_filename = filename + '.cleaned'

    @staticmethod
    def _path_to_uri(path):
        """Return an escaped ``file://`` URI for ``path``.

        Plain string concatenation yields an invalid URI as soon as the
        path contains characters such as spaces, ``#`` or ``%``.
        """
        return GLib.filename_to_uri(os.path.abspath(path), None)

    def remove_all(self):
        """
        Load the document into Poppler, render pages on PNG,
        and shove those PNG into a new PDF. Metadata from the new
        PDF are removed via Poppler, because there is no way to tell
        cairo to not add "created by cairo" during rendering.

        The cleaned document is saved to ``self.output_filename``.

        TODO: Improve the resolution

        :return: True once the cleaned document has been saved.
        """
        scale = 2  # every page is rasterised at twice its nominal size

        document = Poppler.Document.new_from_file(self.uri, self.password)
        page_count = document.get_n_pages()

        # The intermediate PDF is kept in memory. The initial surface size
        # is a placeholder: it is reset for each page below.
        pdf_out = io.BytesIO()
        pdf_surface = cairo.PDFSurface(pdf_out, 128, 128)
        pdf_context = cairo.Context(pdf_surface)

        for pagenum in range(page_count):
            page = document.get_page(pagenum)
            page_width, page_height = page.get_size()
            logging.info("Rendering page %d/%d", pagenum + 1, page_count)

            img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32,
                                             int(page_width) * scale,
                                             int(page_height) * scale)
            img_context = cairo.Context(img_surface)
            img_context.scale(scale, scale)
            page.render_for_printing_with_options(img_context, Poppler.PrintFlags.DOCUMENT)
            img_context.show_page()

            # Round-trip through PNG so that only pixels reach the output.
            buf = io.BytesIO()
            img_surface.write_to_png(buf)
            img_surface.finish()
            buf.seek(0)

            img = cairo.ImageSurface.create_from_png(buf)
            pdf_surface.set_size(page_width * scale, page_height * scale)
            pdf_context.set_source_surface(img, 0, 0)
            pdf_context.paint()
            pdf_context.show_page()

        pdf_surface.finish()

        # Reload the rendered PDF with Poppler to overwrite the producer
        # and creator fields that cairo filled in.
        b = GLib.Bytes(pdf_out.getvalue())
        input_stream = Gio.MemoryInputStream.new_from_bytes(b)
        out_document = Poppler.Document.new_from_stream(input_stream, -1, self.password, None)
        out_document.set_producer('totally not MAT2 ;)')
        out_document.set_creator('')
        out_document.save(self._path_to_uri(self.output_filename))
        return True

    def get_meta(self):
        """ Return a dict with all the meta of the file

        Only the keys of ``self.meta_list`` whose value is non-empty are
        included; values are converted to ``str``.
        """
        logging.debug("URI: %s", self.uri)
        document = Poppler.Document.new_from_file(self.uri, self.password)
        metadata = {}
        for key in self.meta_list:
            value = document.get_property(key)
            if value:
                metadata[key] = str(value)
        return metadata
class AbstractParser(object):
def __init__(self, filename: str):
self.filename = filename
self.output_filename = filename + '.cleaned'
self.meta_list = set()
def get_meta(self):
......
......@@ -31,20 +31,6 @@ class PDFParser(abstract.AbstractParser):
self.uri = 'file://' + os.path.abspath(self.filename)
self.password = None
def __optimize_image_size(self, img: io.BytesIO) -> io.BytesIO:
    """Re-encode an in-memory image as an optimized PNG.

    The image is resized to its own dimensions, so only the encoding
    changes, not the pixel size.

    :param img: buffer holding the encoded image.
    :return: a new buffer, rewound to its start, holding the PNG; or
        ``img`` itself, untouched, when Pillow could not be imported.
    """
    if Image is None:
        return img
    ret = io.BytesIO()
    im = Image.open(img)
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
    # under its current name.
    resized = im.resize(im.size, Image.LANCZOS)
    resized.save(ret, optimize=True, format="PNG")
    ret.seek(0)
    return ret
def remove_all(self):
"""
Load the document into Poppler, render pages on PNG,
......@@ -57,7 +43,7 @@ class PDFParser(abstract.AbstractParser):
"""
document = Poppler.Document.new_from_file(self.uri, self.password)
pdf_surface = cairo.PDFSurface("OUT.pdf", 128, 128)
pdf_surface = cairo.PDFSurface(self.output_filename, 128, 128)
pdf_context = cairo.Context(pdf_surface)
for pagenum in range(document.get_n_pages()):
......@@ -87,10 +73,11 @@ class PDFParser(abstract.AbstractParser):
pdf_surface.finish()
document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password)
document.set_producer('totally not MAT2 ;)')
document.set_creator('')
document.save('file://' + os.path.abspath("OUT_clean.pdf"))
# This is removing metadata
#document = Poppler.Document.new_from_file('file://' + os.path.abspath('OUT.pdf'), self.password)
#document.set_producer('totally not MAT2 ;)')
#document.set_creator('')
#document.save('file://' + os.path.abspath("OUT_clean.pdf"))
return True
......
......@@ -10,18 +10,27 @@ from src.parsers import pdf
class TestGetMeta(unittest.TestCase):
    """Check the metadata read from the dirty sample PDF."""

    def test_pdf(self):
        """The producer and creator of the sample file are reported as-is."""
        p = pdf.PDFParser('./tests/data/dirty.pdf')
        # get_meta() returns a dict, so it can be indexed by key directly.
        meta = p.get_meta()
        self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
        self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'")
class TestCleaning(unittest.TestCase):
    """Check that cleaning a PDF produces a file without the original metadata."""

    def setUp(self):
        """Work on a copy so that the dirty sample is never modified."""
        shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')

    def tearDown(self):
        """Remove the working copy and the cleaned output, if any."""
        os.remove('./tests/data/clean.pdf')
        try:
            os.remove('./tests/data/clean.pdf.cleaned')
        except FileNotFoundError:
            # The test failed before the cleaned file was written.
            pass

    def test_pdf(self):
        """The cleaned file keeps only the fields written by cairo."""
        p = pdf.PDFParser('./tests/data/clean.pdf')

        # The working copy still carries the original metadata.
        meta = p.get_meta()
        self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')

        ret = p.remove_all()
        self.assertTrue(ret)

        p = pdf.PDFParser('./tests/data/clean.pdf.cleaned')
        # NOTE(review): these values embed a cairo version string; the
        # assertion presumably breaks with another cairo release - confirm.
        remaining_meta = {'creator': 'cairo 1.14.10 (http://cairographics.org)',
                          'producer': 'cairo 1.14.10 (http://cairographics.org)'}
        self.assertEqual(p.get_meta(), remaining_meta)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment