Commit 7ec1eff9 authored by jvoisin

Improve the way we parse/display pdf metadata

parent 0239ab3b
......@@ -3,6 +3,7 @@
"""
import os
import re
import logging
import tempfile
import io
......@@ -76,6 +77,13 @@ class PDFParser(abstract.AbstractParser):
return True
def __parse_metadata_field(self, data:str) -> dict:
    """ Extract the XMP-style fields embedded in a raw metadata string.

    Only elements in the `xmp`, `pdfx`, `pdf` and `xmpMM` namespaces
    (matched case-insensitively) are picked up, and only when the opening
    and closing tags sit on the same line, since `.` does not match
    newlines. The namespace prefix is dropped from the resulting key.

    :param data: raw metadata text to scan
    :return: a dict mapping each element name to its text content; when a
             name occurs several times, the last occurrence wins
    """
    # The element name may not contain `<`, `>` or whitespace, and the value
    # is matched lazily so it stops at the *first* matching closing tag.
    # With greedy matching, two same-named elements on one line were fused
    # into a single bogus value spanning everything between them.
    pattern = r"<(xmp|pdfx|pdf|xmpMM):([^<>\s]+)>(.+?)</\1:\2>"
    return {key: value for (_, key, value) in re.findall(pattern, data, re.I)}
def get_meta(self):
""" Return a dict with all the meta of the file
"""
......@@ -84,4 +92,7 @@ class PDFParser(abstract.AbstractParser):
for key in self.meta_list:
if document.get_property(key):
metadata[key] = document.get_property(key)
if 'metadata' in metadata:
parsed_meta = self.__parse_metadata_field(metadata['metadata'])
return {**metadata, **parsed_meta}
return metadata
......@@ -23,6 +23,10 @@ class TestGetMeta(unittest.TestCase):
meta = p.get_meta()
self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'")
self.assertEqual(meta['DocumentID'], "uuid:4a1a79c8-404e-4d38-9580-5bc081036e61")
self.assertEqual(meta['PTEX.Fullbanner'], "This is pdfTeX, Version " \
"3.1415926-2.5-1.40.14 (TeX Live 2013/Debian) kpathsea " \
"version 6.1.1")
def test_png(self):
p = images.PNGParser('./tests/data/dirty.png')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment