Skip to content
Snippets Groups Projects
Commit 1ee93642 authored by Julien (jvoisin) Voisin
Browse files

Display docx metadata

parent e4d2506d
Branches
Tags
No related merge requests found
import re
import subprocess
import json
import zipfile
......@@ -16,11 +17,19 @@ class OfficeParser(abstract.AbstractParser):
files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}
def get_meta(self):
    """
    Return the metadata stored in the ``docProps/`` members of the archive.

    ``self.filename`` is opened as a zip archive and every member whose
    name starts with ``docProps/`` is inspected:

    * ``.xml`` members are decoded as UTF-8 and scanned for
      ``<tag>value</tag>`` pairs; each pair is reported as
      ``metadata[tag] = value`` (a later pair with the same tag
      overwrites an earlier one).
    * any member from which nothing could be extracted (not an ``.xml``
      file, not valid UTF-8, or no tag/value pair found) is reported as
      ``metadata[member_name] = 'harmful content'``.

    Yes, I know that parsing xml with regexp ain't pretty,
    be my guest and fix it if you want.

    :return: dict mapping tag names (or member names) to string values;
        empty when the archive has no ``docProps/`` member.

    Errors raised by ``zipfile.ZipFile`` while opening or reading the
    archive are not caught here and propagate to the caller.
    """
    metadata = {}
    # The context manager closes the archive even if reading a member fails.
    with zipfile.ZipFile(self.filename) as zipin:
        for item in zipin.namelist():
            if not item.startswith('docProps/'):
                continue

            pairs = []
            if item.endswith('.xml'):
                try:
                    content = zipin.read(item).decode('utf-8')
                except UnicodeDecodeError:
                    # Undecodable member: nothing to extract, it gets
                    # flagged below instead of aborting the whole scan.
                    content = ''
                pairs = re.findall(r"<(.+)>(.+)</\1>", content, re.I)

            if pairs:
                metadata.update(pairs)
            else:  # better safe than sorry
                # Decided per member, so one parseable file cannot hide
                # another one that we failed to understand.
                metadata[item] = 'harmful content'
    return metadata
......
......@@ -42,7 +42,9 @@ class TestGetMeta(unittest.TestCase):
def test_docx(self):
    """Check the metadata that get_meta() reports for the dirty.docx fixture."""
    p = office.OfficeParser('./tests/data/dirty.docx')
    meta = p.get_meta()
    # Fields expected from the fixture: last editor, author, and the
    # application string that produced the document.
    self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin')
    self.assertEqual(meta['dc:creator'], 'julien voisin')
    self.assertEqual(
        meta['Application'],
        'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1',
    )
class TestCleaning(unittest.TestCase):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment