Display docx metadata

1ee93642 · Julien (jvoisin) Voisin · e4d2506d
Commit 1ee93642 authored 7 years ago by Julien (jvoisin) Voisin
--- a/src/office.py
+++ b/src/office.py
+import re
 import subprocess
 import json
 import zipfile
@@ -16,11 +17,19 @@ class OfficeParser(abstract.AbstractParser):
    files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}
    def get_meta(self):
+        """
+        Yes, I know that parsing xml with regexp ain't pretty,
+        be my guest and fix it if you want.
+        """
        metadata = {}
        zipin = zipfile.ZipFile(self.filename)
        for item in zipin.namelist():
-            if item.startswith('docProps/'):
+            if item.startswith('docProps/') and item.endswith('.xml'):
-                metadata[item] = 'harmful content'
+                content = zipin.read(item).decode('utf-8')
+                for (key, value) in re.findall(r"<(.+)>(.+)</\1>", content, re.I):
+                    metadata[key] = value
+                if not metadata:  # better safe than sorry
+                    metadata[item] = 'harmful content'
        zipin.close()
        return metadata

--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -42,7 +42,9 @@ class TestGetMeta(unittest.TestCase):
    def test_docx(self):
        p = office.OfficeParser('./tests/data/dirty.docx')
        meta = p.get_meta()
-        print(meta)
+        self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin')
+        self.assertEqual(meta['dc:creator'], 'julien voisin')
+        self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1')
 class TestCleaning(unittest.TestCase):