Skip to content
Snippets Groups Projects
Commit 1ee93642 authored by Julien (jvoisin) Voisin's avatar Julien (jvoisin) Voisin
Browse files

Display docx metadata

parent e4d2506d
No related branches found
No related tags found
No related merge requests found
import re
import subprocess
import json
import zipfile
......@@ -16,11 +17,19 @@ class OfficeParser(abstract.AbstractParser):
files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}
def get_meta(self):
    """
    Collect the metadata stored in the `docProps/*.xml` archive members.

    Yes, I know that parsing xml with regexp ain't pretty,
    be my guest and fix it if you want.

    Returns a dict mapping every matched XML tag name to its text
    content. A `docProps/` XML member in which the regexp finds nothing
    is recorded under its own archive path with the value
    'harmful content', so that it is never silently ignored.
    """
    metadata = {}
    # The context manager closes the archive even if a read or decode fails.
    with zipfile.ZipFile(self.filename) as zipin:
        for item in zipin.namelist():
            if not (item.startswith('docProps/') and item.endswith('.xml')):
                continue
            content = zipin.read(item).decode('utf-8')
            fields = re.findall(r"<(.+)>(.+)</\1>", content, re.I)
            if fields:
                metadata.update(fields)
            else:  # better safe than sorry
                # Checked per file: an unparsable member must be flagged
                # even when earlier members already yielded some keys.
                metadata[item] = 'harmful content'
    return metadata
......
......@@ -42,7 +42,9 @@ class TestGetMeta(unittest.TestCase):
def test_docx(self):
    """Check the metadata fields read from the `dirty.docx` fixture."""
    p = office.OfficeParser('./tests/data/dirty.docx')
    meta = p.get_meta()
    # No debugging print here: the assertions below already report
    # the offending value when they fail.
    self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin')
    self.assertEqual(meta['dc:creator'], 'julien voisin')
    self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1')
class TestCleaning(unittest.TestCase):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment