From 8f44616366f9ca395314d59a98840e2912f488df Mon Sep 17 00:00:00 2001
From: jvoisin <julien.voisin@dustri.org>
Date: Mon, 19 Mar 2018 23:43:49 +0100
Subject: [PATCH] Implement mimetype detection

---
 main.py                 |  9 +++++----
 src/parser_factory.py   | 10 ++++++++++
 src/parsers/abstract.py |  1 +
 src/parsers/pdf.py      | 13 +++++++------
 4 files changed, 23 insertions(+), 10 deletions(-)
 create mode 100644 src/parser_factory.py

diff --git a/main.py b/main.py
index e4157e6..4b965b4 100644
--- a/main.py
+++ b/main.py
@@ -3,6 +3,7 @@ from shutil import copyfile
 import argparse
 
 from src.parsers import pdf
+from src import parser_factory
 
 
 def create_arg_parser():
@@ -19,7 +20,7 @@ def create_arg_parser():
     return parser
 
 def show_meta(file_name:str):
-    p = pdf.PDFParser(file_name)
+    p = parser_factory.get_parser(file_name)
     for k,v in p.get_meta().items():
         print("%s: %s" % (k, v))
 
@@ -32,10 +33,10 @@ def main():
             show_meta(f)
         return 0
     elif not args.files:
-        return parser.show_help()
+        return argparser.show_help()
 
-    copyfile(sys.argv[1] + '.bak', sys.argv[1])
-    p = pdf.PDFParser(sys.argv[1])
+    #p = pdf.PDFParser(sys.argv[1])
+    p = parser_factory.get_parser(sys.argv[1])
     p.remove_all()
     p = pdf.PDFParser('OUT_clean.pdf')
     print("ok")
diff --git a/src/parser_factory.py b/src/parser_factory.py
new file mode 100644
index 0000000..a93595a
--- /dev/null
+++ b/src/parser_factory.py
@@ -0,0 +1,23 @@
+import mimetypes
+
+from .parsers import abstract
+# NOTE(review): presumably pulls in the parser modules so their classes are
+# listed by `__subclasses__()` below -- confirm against parsers/__init__.py
+from .parsers import *
+
+def get_parser(filename: str):
+    """ Return a parser instance suited to `filename`, or None
+
+    The mimetype is guessed from the file name by `mimetypes.guess_type`,
+    then looked up in the `mimetypes` attribute of each direct subclass
+    of `abstract.AbstractParser`; the first match is instantiated.
+    """
+    mtype, _ = mimetypes.guess_type(filename)
+    if mtype is None:  # the type could not be guessed
+        return None
+    for c in abstract.AbstractParser.__subclasses__():
+        # `mimetypes` is read on the class itself: a parser that does not
+        # declare it at class level simply matches nothing.
+        if mtype in getattr(c, 'mimetypes', ()):
+            return c(filename)
+    return None
diff --git a/src/parsers/abstract.py b/src/parsers/abstract.py
index d0e7108..80bb812 100644
--- a/src/parsers/abstract.py
+++ b/src/parsers/abstract.py
@@ -3,6 +3,7 @@ class AbstractParser(object):
         self.filename = filename
         self.output_filename = filename + '.cleaned'
         self.meta_list = set()
+        self.mimetypes = set()
 
     def get_meta(self):
         raise NotImplementedError
diff --git a/src/parsers/pdf.py b/src/parsers/pdf.py
index 26985c6..e7bd00d 100644
--- a/src/parsers/pdf.py
+++ b/src/parsers/pdf.py
@@ -20,13 +20,14 @@ logging.basicConfig(level=logging.DEBUG)
 
 
 class PDFParser(abstract.AbstractParser):
+    mimetypes = {'application/pdf', }  # matched by parser_factory.get_parser
+
     def __init__(self, filename):
         super().__init__(filename)
         self.meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
                 'metadata', 'mod-date', 'producer', 'subject', 'title',
                 'viewer-preferences'}
         self.uri = 'file://' + os.path.abspath(self.filename)
-        self.password = None
 
     def remove_all(self):
         """
@@ -35,7 +36,7 @@ class PDFParser(abstract.AbstractParser):
             PDF are removed via Poppler, because there is no way to tell
             cairo to not add "created by cairo" during rendering.
         """
-        document = Poppler.Document.new_from_file(self.uri, self.password)
+        document = Poppler.Document.new_from_file(self.uri, None)
         pages_count = document.get_n_pages()
 
         _, tmp_path = tempfile.mkstemp()
@@ -80,7 +81,7 @@ class PDFParser(abstract.AbstractParser):
         """ Return a dict with all the meta of the file
         """
         print("URI: %s", self.uri)
-        document = Poppler.Document.new_from_file(self.uri, self.password)
+        document = Poppler.Document.new_from_file(self.uri, None)
         metadata = {}
         for key in self.meta_list:
             if document.get_property(key):
-- 
GitLab