From 1ee936420ca1df1ebff14f19de28df5c41602b2b Mon Sep 17 00:00:00 2001
From: jvoisin <julien.voisin@dustri.org>
Date: Sat, 31 Mar 2018 20:56:15 +0200
Subject: [PATCH] Display docx metadata

---
 src/office.py         | 13 +++++++++++--
 tests/test_libmat2.py |  4 +++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/office.py b/src/office.py
index 2bdeec7..5de0597 100644
--- a/src/office.py
+++ b/src/office.py
@@ -1,3 +1,4 @@
+import re
 import subprocess
 import json
 import zipfile
@@ -16,11 +17,19 @@ class OfficeParser(abstract.AbstractParser):
     files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}
 
     def get_meta(self):
+        """
+        Return a dict mapping the tags found in docProps/*.xml to their
+        text; the xml is parsed with a (crude) regexp, not an xml parser.
+        """
         metadata = {}
         zipin = zipfile.ZipFile(self.filename)
         for item in zipin.namelist():
-            if item.startswith('docProps/'):
-                metadata[item] = 'harmful content'
+            if item.startswith('docProps/') and item.endswith('.xml'):
+                content = zipin.read(item).decode('utf-8')
+                found = re.findall(r"<(.+)>(.+)</\1>", content, re.I)
+                metadata.update(found)  # (tag, text) pairs of this file only
+                if not found:  # better safe than sorry: flag *this* file
+                    metadata[item] = 'harmful content'
         zipin.close()
         return metadata
 
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 02579b0..717de3f 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -42,7 +42,9 @@ class TestGetMeta(unittest.TestCase):
     def test_docx(self):
         p = office.OfficeParser('./tests/data/dirty.docx')
         meta = p.get_meta()
-        print(meta)
+        self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin')  # keys are xml tag names
+        self.assertEqual(meta['dc:creator'], 'julien voisin')  # values are the tags' text
+        self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1')
 
 
 class TestCleaning(unittest.TestCase):
-- 
GitLab