Skip to content
Snippets Groups Projects
pdf.py 4.96 KiB
Newer Older
  • Learn to ignore specific revisions
  • Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
    """ Handle PDF
    
    """
    
    import os
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
    import logging
    import tempfile
    import io
    
    from distutils.version import LooseVersion
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
    
    import cairo
    import gi
    gi.require_version('Poppler', '0.18')
    
    from gi.repository import Poppler, GLib
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
    
    from . import abstract
    
    
    poppler_version = Poppler.get_version()
    
    if LooseVersion(poppler_version) < LooseVersion('0.46'): # pragma: no cover
    
        raise ValueError("MAT2 needs at least Poppler version 0.46 to work. \
    
    The installed version is %s." % poppler_version)  # pragma: no cover
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
    
    class PDFParser(abstract.AbstractParser):
    
        mimetypes = {'application/pdf', }
        meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
    
                     'metadata', 'mod-date', 'producer', 'subject', 'title',
                     'viewer-preferences'}
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
        def __init__(self, filename):
            super().__init__(filename)
            self.uri = 'file://' + os.path.abspath(self.filename)
    
            self.__scale = 2  # how much precision do we want for the render
    
            try:  # Check now that the file is valid, to avoid surprises later
                Poppler.Document.new_from_file(self.uri, None)
            except GLib.GError:  # Invalid PDF
                raise ValueError
    
        def remove_all_lightweight(self):
            """
                Load the document into Poppler, render pages on a new PDFSurface.
            """
            document = Poppler.Document.new_from_file(self.uri, None)
            pages_count = document.get_n_pages()
    
            tmp_path = tempfile.mkstemp()[1]
    
            pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)  # resized later anyway
    
            pdf_context = cairo.Context(pdf_surface)  # context draws on the surface
    
            for pagenum in range(pages_count):
                logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
                page = document.get_page(pagenum)
                page_width, page_height = page.get_size()
                pdf_surface.set_size(page_width, page_height)
                pdf_context.save()
                page.render_for_printing(pdf_context)
                pdf_context.restore()
                pdf_context.show_page()  # draw pdf_context on pdf_surface
            pdf_surface.finish()
    
            self.__remove_superficial_meta(tmp_path, self.output_filename)
            os.remove(tmp_path)
    
            return True
    
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
        def remove_all(self):
            """
                Load the document into Poppler, render pages on PNG,
    
                and shove those PNG into a new PDF.
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
            """
    
            document = Poppler.Document.new_from_file(self.uri, None)
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
            pages_count = document.get_n_pages()
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
            _, tmp_path = tempfile.mkstemp()
    
            pdf_surface = cairo.PDFSurface(tmp_path, 32, 32)  # resized later anyway
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
            pdf_context = cairo.Context(pdf_surface)
    
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
            for pagenum in range(pages_count):
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
                page = document.get_page(pagenum)
                page_width, page_height = page.get_size()
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
                logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
    
                width = int(page_width) * self.__scale
                height = int(page_height) * self.__scale
                img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, height)
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
                img_context = cairo.Context(img_surface)
    
    
                img_context.scale(self.__scale, self.__scale)
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
                page.render_for_printing(img_context)
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
                img_context.show_page()
    
                buf = io.BytesIO()
                img_surface.write_to_png(buf)
                img_surface.finish()
                buf.seek(0)
    
                img = cairo.ImageSurface.create_from_png(buf)
    
                pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale)
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
                pdf_context.set_source_surface(img, 0, 0)
                pdf_context.paint()
    
                pdf_context.show_page()  # draw pdf_context on pdf_surface
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
    
            pdf_surface.finish()
    
    
            # Removes metadata added by Poppler
    
            self.__remove_superficial_meta(tmp_path, self.output_filename)
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
            os.remove(tmp_path)
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
    
            return True
    
    
        @staticmethod
        def __remove_superficial_meta(in_file: str, out_file: str) -> bool:
    
            document = Poppler.Document.new_from_file('file://' + in_file)
            document.set_producer('')
            document.set_creator('')
    
            document.set_creation_date(-1)
    
            document.save('file://' + os.path.abspath(out_file))
            return True
    
    
        @staticmethod
        def __parse_metadata_field(data: str) -> dict:
    
            metadata = {}
            for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):
                metadata[key] = value
            return metadata
    
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
        def get_meta(self):
            """ Return a dict with all the meta of the file
            """
            metadata = {}
    
            document = Poppler.Document.new_from_file(self.uri, None)
    
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
            for key in self.meta_list:
                if document.get_property(key):
                    metadata[key] = document.get_property(key)
    
            if 'metadata' in metadata:
    
                parsed_meta = self.__parse_metadata_field(metadata['metadata'])
    
                for key, value in parsed_meta.items():
                    metadata[key] = value
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
            return metadata