Skip to content
Snippets Groups Projects
pdf.py 2.87 KiB
Newer Older
  • Learn to ignore specific revisions
  • Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
    """ Handle PDF
    
    """
    
    import os
    import logging
    import tempfile
    import shutil
    import io
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
    import tempfile
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
    
    import cairo
    import gi
    gi.require_version('Poppler', '0.18')
    from gi.repository import Poppler
    
    from . import abstract
    
    # Configure the root logger at DEBUG level as soon as this module is imported.
    # NOTE(review): this is an import-time side effect that applies to every
    # program importing this module; logging setup is usually left to the
    # application entry point -- confirm this is intended.
    logging.basicConfig(level=logging.DEBUG)
    
    
    class PDFParser(abstract.AbstractParser):
        """ Parser for PDF files, built on Poppler (reading) and cairo (writing).

        Metadata is read from Poppler document properties. Cleaning is done
        by re-rendering every page as a raster image into a brand new PDF.
        """

        mimetypes = {'application/pdf', }
        # Names of the Poppler.Document properties reported by get_meta().
        meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
                     'metadata', 'mod-date', 'producer', 'subject', 'title',
                     'viewer-preferences'}

        def __init__(self, filename):
            """ Remember the file to process and precompute its `file://` URI.

            :param filename: path of the PDF file, forwarded to the parent
                constructor (which presumably sets `self.filename` -- the
                parent class is not visible from this module).
            """
            super().__init__(filename)
            self.uri = self._path_to_uri(self.filename)

            self.__scale = 2  # how much precision do we want for the render

        @staticmethod
        def _path_to_uri(path):
            """ Return an absolute, percent-encoded `file://` URI for `path`.

            Plain string concatenation ('file://' + path) yields an invalid
            URI as soon as the path contains characters such as '#' or '%'.
            """
            from pathlib import Path  # local import: keeps the class self-contained
            return Path(os.path.abspath(path)).as_uri()

        def remove_all(self):
            """
                Load the document into Poppler, render pages on PNG,
                and shove those PNG into a new PDF. Metadata from the new
                PDF are removed via Poppler, because there is no way to tell
                cairo to not add "created by cairo" during rendering.

                The cleaned document is written to `self.output_filename`;
                the intermediate temporary file is always deleted, even when
                rendering or saving fails.

                :return: True once the cleaned file has been saved.
            """
            document = Poppler.Document.new_from_file(self.uri, None)
            pages_count = document.get_n_pages()

            # mkstemp() hands back an already-open descriptor. cairo opens the
            # file again by path, so close ours immediately instead of leaking it.
            tmp_fd, tmp_path = tempfile.mkstemp()
            os.close(tmp_fd)

            try:
                # The initial 128x128 size is only a placeholder: the surface is
                # resized for every page before anything is painted on it.
                pdf_surface = cairo.PDFSurface(tmp_path, 128, 128)
                pdf_context = cairo.Context(pdf_surface)

                for pagenum in range(pages_count):
                    page = document.get_page(pagenum)
                    page_width, page_height = page.get_size()
                    logging.info("Rendering page %d/%d", pagenum + 1, pages_count)

                    # Rasterise the page at `scale` times its nominal size.
                    img_surface = cairo.ImageSurface(
                        cairo.FORMAT_ARGB32,
                        int(page_width) * self.__scale,
                        int(page_height) * self.__scale)
                    img_context = cairo.Context(img_surface)

                    img_context.scale(self.__scale, self.__scale)
                    page.render_for_printing(img_context)
                    img_context.show_page()

                    # Round-trip through an in-memory PNG so that only pixels
                    # are carried over into the new document.
                    buf = io.BytesIO()
                    img_surface.write_to_png(buf)
                    img_surface.finish()
                    buf.seek(0)

                    img = cairo.ImageSurface.create_from_png(buf)

                    pdf_surface.set_size(page_width*self.__scale,
                                         page_height*self.__scale)
                    pdf_context.set_source_surface(img, 0, 0)
                    pdf_context.paint()
                    pdf_context.show_page()

                pdf_surface.finish()

                # Removes metadata added by cairo
                document = Poppler.Document.new_from_file(
                    self._path_to_uri(tmp_path), None)
                document.set_producer('')
                document.set_creator('')
                document.save(self._path_to_uri(self.output_filename))
            finally:
                os.remove(tmp_path)

            return True

        def get_meta(self):
            """ Return a dict with all the meta of the file

            Only the properties listed in `meta_list` are looked up, and
            those with an empty/falsy value are left out of the result.
            """
            document = Poppler.Document.new_from_file(self.uri, None)

            metadata = {}
            for key in self.meta_list:
                value = document.get_property(key)  # single lookup per key
                if value:
                    metadata[key] = value
            return metadata