Skip to content
Snippets Groups Projects
Commit 7ec1eff9 authored by Julien (jvoisin) Voisin
Browse files

Improve the way we parse/display pdf metadata

parent 0239ab3b
Branches
Tags
No related merge requests found
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
""" """
import os import os
import re
import logging import logging
import tempfile import tempfile
import io import io
...@@ -76,6 +77,13 @@ class PDFParser(abstract.AbstractParser): ...@@ -76,6 +77,13 @@ class PDFParser(abstract.AbstractParser):
return True return True
def __parse_metadata_field(self, data: str) -> dict:
    """Extract `<ns:key>value</ns:key>` pairs from a metadata string.

    Only elements whose namespace prefix is `xmp`, `pdfx`, `pdf` or
    `xmpMM` (matched case-insensitively) are picked up. The opening tag
    must carry no attributes, and the value must be non-empty and sit on
    a single line, because `.` does not match newlines here.

    NOTE(review): `data` is presumably the raw XMP/XML packet stored under
    the 'metadata' key by the caller -- confirm against get_meta().

    :param data: text to scan for namespaced elements.
    :return: dict mapping each element's local name (without the
        namespace prefix) to its text content; when a name occurs several
        times the last occurrence wins. Empty dict if nothing matches.
    """
    if not data:
        return {}

    # The key excludes whitespace and angle brackets so it cannot run past
    # the end of the opening tag; the value is lazy so that it stops at the
    # first matching closing tag instead of swallowing sibling elements
    # that happen to share the same name on the same line.
    pattern = r"<(xmp|pdfx|pdf|xmpMM):([^\s<>]+)>(.+?)</\1:\2>"
    return {key: value for _, key, value in re.findall(pattern, data, re.I)}
def get_meta(self): def get_meta(self):
""" Return a dict with all the meta of the file """ Return a dict with all the meta of the file
""" """
...@@ -84,4 +92,7 @@ class PDFParser(abstract.AbstractParser): ...@@ -84,4 +92,7 @@ class PDFParser(abstract.AbstractParser):
for key in self.meta_list: for key in self.meta_list:
if document.get_property(key): if document.get_property(key):
metadata[key] = document.get_property(key) metadata[key] = document.get_property(key)
if 'metadata' in metadata:
parsed_meta = self.__parse_metadata_field(metadata['metadata'])
return {**metadata, **parsed_meta}
return metadata return metadata
...@@ -23,6 +23,10 @@ class TestGetMeta(unittest.TestCase): ...@@ -23,6 +23,10 @@ class TestGetMeta(unittest.TestCase):
meta = p.get_meta() meta = p.get_meta()
self.assertEqual(meta['producer'], 'pdfTeX-1.40.14') self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'") self.assertEqual(meta['creator'], "'Certified by IEEE PDFeXpress at 03/19/2016 2:56:07 AM'")
self.assertEqual(meta['DocumentID'], "uuid:4a1a79c8-404e-4d38-9580-5bc081036e61")
self.assertEqual(meta['PTEX.Fullbanner'], "This is pdfTeX, Version " \
"3.1415926-2.5-1.40.14 (TeX Live 2013/Debian) kpathsea " \
"version 6.1.1")
def test_png(self): def test_png(self):
p = images.PNGParser('./tests/data/dirty.png') p = images.PNGParser('./tests/data/dirty.png')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment