diff --git a/src/office.py b/src/office.py index 2bdeec7adb7489aad2f898b79aea41f3176af372..5de05976d2410cb53f0a5aed379aa28571920252 100644 --- a/src/office.py +++ b/src/office.py @@ -1,3 +1,4 @@ +import re import subprocess import json import zipfile @@ -16,11 +17,19 @@ class OfficeParser(abstract.AbstractParser): files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'} def get_meta(self): + """ + Yes, I know that parsing xml with regexp ain't pretty, + be my guest and fix it if you want. + """ metadata = {} zipin = zipfile.ZipFile(self.filename) for item in zipin.namelist(): - if item.startswith('docProps/'): - metadata[item] = 'harmful content' + if item.startswith('docProps/') and item.endswith('.xml'): + content = zipin.read(item).decode('utf-8') + for (key, value) in re.findall(r"<(.+)>(.+)</\1>", content, re.I): + metadata[key] = value + if not metadata: # better safe than sorry + metadata[item] = 'harmful content' zipin.close() return metadata diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 02579b073b83ee5e708c0c229d7486bda42daf25..717de3fc4457354c07e1a7a47cfedf6a65a37f47 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -42,7 +42,9 @@ class TestGetMeta(unittest.TestCase): def test_docx(self): p = office.OfficeParser('./tests/data/dirty.docx') meta = p.get_meta() - print(meta) + self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin') + self.assertEqual(meta['dc:creator'], 'julien voisin') + self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1') class TestCleaning(unittest.TestCase):