From 7ec1eff96e3125890b268dbafeebefe6fc923ef2 Mon Sep 17 00:00:00 2001
From: jvoisin <julien.voisin@dustri.org>
Date: Wed, 11 Apr 2018 23:20:59 +0200
Subject: [PATCH] Improve the way we parse/display pdf metadata

---
 src/pdf.py            | 11 +++++++++++
 tests/test_libmat2.py |  4 ++++
 2 files changed, 15 insertions(+)

diff --git a/src/pdf.py b/src/pdf.py
index 96eec13..c119449 100644
--- a/src/pdf.py
+++ b/src/pdf.py
@@ -3,6 +3,7 @@
 """
 
 import os
+import re
 import logging
 import tempfile
 import io
@@ -76,6 +77,13 @@ class PDFParser(abstract.AbstractParser):
 
         return True
 
+
+    def __parse_metadata_field(self, data: str) -> dict:
+        """ Return a {tag: text} dict of the xmp/pdfx/pdf/xmpMM elements found in `data` """
+        # Lazy groups stop at the first matching close tag, so same-named elements on one line stay separate.
+        found = re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+?)>(.+?)</\1:\2>", data, re.I)
+        return {key: value for (_, key, value) in found}
+
     def get_meta(self):
         """ Return a dict with all the meta of the file
         """
@@ -84,4 +92,7 @@ class PDFParser(abstract.AbstractParser):
         for key in self.meta_list:
             if document.get_property(key):
                 metadata[key] = document.get_property(key)
+        if 'metadata' in metadata:
+            parsed_meta =  self.__parse_metadata_field(metadata['metadata'])
+            return {**metadata, **parsed_meta}
         return metadata
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 4cfb80a..6141dbe 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -23,6 +23,10 @@ class TestGetMeta(unittest.TestCase):
         meta = p.get_meta()
         self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
         self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'")
+        self.assertEqual(meta['DocumentID'], "uuid:4a1a79c8-404e-4d38-9580-5bc081036e61")
+        self.assertEqual(meta['PTEX.Fullbanner'], "This is pdfTeX, Version " \
+                "3.1415926-2.5-1.40.14 (TeX Live 2013/Debian) kpathsea " \
+                "version 6.1.1")
 
     def test_png(self):
         p = images.PNGParser('./tests/data/dirty.png')
-- 
GitLab