Skip to content
Snippets Groups Projects
Commit 8f446163 authored by Julien (jvoisin) Voisin's avatar Julien (jvoisin) Voisin
Browse files

Implement mimetype detection

parent d262f780
No related branches found
No related tags found
No related merge requests found
......@@ -3,6 +3,7 @@ from shutil import copyfile
import argparse
from src.parsers import pdf
from src import parser_factory
def create_arg_parser():
......@@ -19,7 +20,7 @@ def create_arg_parser():
return parser
def show_meta(file_name: str):
    """Print every metadata entry of `file_name`, one ``key: value`` per line.

    The parser is obtained from ``parser_factory.get_parser`` so that the
    file type decides which parser handles it, instead of always assuming
    a PDF.

    :param file_name: path of the file whose metadata should be displayed.
    """
    # `parser_factory` is a module: the callable is its `get_parser` function.
    p = parser_factory.get_parser(file_name)
    if p is None:
        # No parser claims this file's mimetype; report it instead of
        # crashing on `None.get_meta()`.
        print("[-] %s's format is not supported" % file_name)
        return
    for k, v in p.get_meta().items():
        print("%s: %s" % (k, v))
......@@ -32,10 +33,10 @@ def main():
show_meta(f)
return 0
elif not args.files:
return parser.show_help()
return argparser.show_help()
copyfile(sys.argv[1] + '.bak', sys.argv[1])
p = pdf.PDFParser(sys.argv[1])
#p = pdf.PDFParser(sys.argv[1])
p = parser_factory.get_parser(sys.argv[1])
p.remove_all()
p = pdf.PDFParser('OUT_clean.pdf')
print("ok")
......
import mimetypes
from .parsers import abstract
from .parsers import *
def get_parser(filename: str):
    """Instantiate the parser able to handle `filename`.

    The mimetype is guessed from the file name with
    ``mimetypes.guess_type``; the first direct subclass of
    ``abstract.AbstractParser`` that lists this mimetype in its
    ``mimetypes`` attribute is instantiated with `filename`.

    :param filename: path of the file to find a parser for.
    :return: a parser instance, or ``None`` when no parser subclass
        declares the guessed mimetype.
    """
    guessed_type = mimetypes.guess_type(filename)[0]
    candidates = abstract.AbstractParser.__subclasses__()
    # Lazily scan the candidates so that only the first match is used.
    matching = (cls for cls in candidates if guessed_type in cls.mimetypes)
    parser_class = next(matching, None)
    if parser_class is None:
        return None
    return parser_class(filename)
......@@ -3,6 +3,7 @@ class AbstractParser(object):
self.filename = filename
self.output_filename = filename + '.cleaned'
self.meta_list = set()
self.mimetypes = set()
def get_meta(self):
raise NotImplementedError
......
......@@ -20,13 +20,14 @@ logging.basicConfig(level=logging.DEBUG)
class PDFParser(abstract.AbstractParser):
mimetypes = {'application/pdf', }
meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
'metadata', 'mod-date', 'producer', 'subject', 'title',
'viewer-preferences'}
def __init__(self, filename):
    """Prepare the parser state for the PDF located at `filename`.

    Besides what the parent initialiser sets up, this records the
    document properties to look up (`meta_list`), the ``file://`` URI
    built from the absolute path of the file (`uri`), and an unset
    `password`.
    """
    super().__init__(filename)
    # No password is known when the parser is created.
    self.password = None
    absolute_path = os.path.abspath(self.filename)
    self.uri = 'file://' + absolute_path
    self.meta_list = {
        'author', 'creation-date', 'creator', 'format', 'keywords',
        'metadata', 'mod-date', 'producer', 'subject', 'title',
        'viewer-preferences',
    }
def remove_all(self):
"""
......@@ -35,7 +36,7 @@ class PDFParser(abstract.AbstractParser):
PDF are removed via Poppler, because there is no way to tell
cairo to not add "created by cairo" during rendering.
"""
document = Poppler.Document.new_from_file(self.uri, self.password)
document = Poppler.Document.new_from_file(self.uri, None)
pages_count = document.get_n_pages()
_, tmp_path = tempfile.mkstemp()
......@@ -80,7 +81,7 @@ class PDFParser(abstract.AbstractParser):
""" Return a dict with all the meta of the file
"""
print("URI: %s", self.uri)
document = Poppler.Document.new_from_file(self.uri, self.password)
document = Poppler.Document.new_from_file(self.uri, None)
metadata = {}
for key in self.meta_list:
if document.get_property(key):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment