Skip to content
Snippets Groups Projects
Commit 8f446163 authored by Julien (jvoisin) Voisin's avatar Julien (jvoisin) Voisin
Browse files

Implement mimetype detection

parent d262f780
No related branches found
No related tags found
No related merge requests found
...@@ -3,6 +3,7 @@ from shutil import copyfile ...@@ -3,6 +3,7 @@ from shutil import copyfile
import argparse import argparse
from src.parsers import pdf from src.parsers import pdf
from src import parser_factory
def create_arg_parser(): def create_arg_parser():
...@@ -19,7 +20,7 @@ def create_arg_parser(): ...@@ -19,7 +20,7 @@ def create_arg_parser():
return parser return parser
def show_meta(file_name:str): def show_meta(file_name:str):
p = pdf.PDFParser(file_name) p = parser_factory(file_name)
for k,v in p.get_meta().items(): for k,v in p.get_meta().items():
print("%s: %s" % (k, v)) print("%s: %s" % (k, v))
...@@ -32,10 +33,10 @@ def main(): ...@@ -32,10 +33,10 @@ def main():
show_meta(f) show_meta(f)
return 0 return 0
elif not args.files: elif not args.files:
return parser.show_help() return argparser.show_help()
copyfile(sys.argv[1] + '.bak', sys.argv[1]) #p = pdf.PDFParser(sys.argv[1])
p = pdf.PDFParser(sys.argv[1]) p = parser_factory.get_parser(sys.argv[1])
p.remove_all() p.remove_all()
p = pdf.PDFParser('OUT_clean.pdf') p = pdf.PDFParser('OUT_clean.pdf')
print("ok") print("ok")
......
import mimetypes
from .parsers import abstract
from .parsers import *
def get_parser(filename: str):
    """Return a parser instance suited to *filename*, or None.

    The mimetype is guessed from the file name alone (the file is not
    opened), then compared against the ``mimetypes`` collection declared
    on each direct subclass of ``abstract.AbstractParser``. The first
    subclass claiming the guessed mimetype is instantiated with
    *filename* and returned.

    Only classes whose defining module has already been imported show up
    in ``__subclasses__()``.

    :param filename: path or name of the file to find a parser for.
    :return: an instance of the first matching parser class, or ``None``
        when the mimetype cannot be guessed or no parser claims it.
    """
    mtype, _ = mimetypes.guess_type(filename)
    for parser_class in abstract.AbstractParser.__subclasses__():
        # A subclass that does not declare a class-level `mimetypes`
        # supports nothing; skip it instead of raising AttributeError.
        supported = getattr(parser_class, 'mimetypes', ())
        if mtype in supported:
            return parser_class(filename)
    return None
...@@ -3,6 +3,7 @@ class AbstractParser(object): ...@@ -3,6 +3,7 @@ class AbstractParser(object):
self.filename = filename self.filename = filename
self.output_filename = filename + '.cleaned' self.output_filename = filename + '.cleaned'
self.meta_list = set() self.meta_list = set()
self.mimetypes = set()
def get_meta(self): def get_meta(self):
raise NotImplementedError raise NotImplementedError
......
...@@ -20,13 +20,14 @@ logging.basicConfig(level=logging.DEBUG) ...@@ -20,13 +20,14 @@ logging.basicConfig(level=logging.DEBUG)
class PDFParser(abstract.AbstractParser): class PDFParser(abstract.AbstractParser):
mimetypes = {'application/pdf', }
meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
'metadata', 'mod-date', 'producer', 'subject', 'title',
'viewer-preferences'}
def __init__(self, filename): def __init__(self, filename):
super().__init__(filename) super().__init__(filename)
self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
'metadata', 'mod-date', 'producer', 'subject', 'title',
'viewer-preferences'}
self.uri = 'file://' + os.path.abspath(self.filename) self.uri = 'file://' + os.path.abspath(self.filename)
self.password = None
def remove_all(self): def remove_all(self):
""" """
...@@ -35,7 +36,7 @@ class PDFParser(abstract.AbstractParser): ...@@ -35,7 +36,7 @@ class PDFParser(abstract.AbstractParser):
PDF are removed via Poppler, because there is no way to tell PDF are removed via Poppler, because there is no way to tell
cairo to not add "created by cairo" during rendering. cairo to not add "created by cairo" during rendering.
""" """
document = Poppler.Document.new_from_file(self.uri, self.password) document = Poppler.Document.new_from_file(self.uri, None)
pages_count = document.get_n_pages() pages_count = document.get_n_pages()
_, tmp_path = tempfile.mkstemp() _, tmp_path = tempfile.mkstemp()
...@@ -80,7 +81,7 @@ class PDFParser(abstract.AbstractParser): ...@@ -80,7 +81,7 @@ class PDFParser(abstract.AbstractParser):
""" Return a dict with all the meta of the file """ Return a dict with all the meta of the file
""" """
print("URI: %s", self.uri) print("URI: %s", self.uri)
document = Poppler.Document.new_from_file(self.uri, self.password) document = Poppler.Document.new_from_file(self.uri, None)
metadata = {} metadata = {}
for key in self.meta_list: for key in self.meta_list:
if document.get_property(key): if document.get_property(key):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment