Skip to content
Snippets Groups Projects
Commit 1ee93642 authored by Julien (jvoisin) Voisin
Browse files

Display docx metadata

parent e4d2506d
Branches
Tags
No related merge requests found
import re
import subprocess import subprocess
import json import json
import zipfile import zipfile
...@@ -16,11 +17,19 @@ class OfficeParser(abstract.AbstractParser): ...@@ -16,11 +17,19 @@ class OfficeParser(abstract.AbstractParser):
files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'} files_to_keep = {'_rels/.rels', 'word/_rels/document.xml.rels'}
def get_meta(self): def get_meta(self):
"""
Yes, I know that parsing xml with regexp ain't pretty,
be my guest and fix it if you want.
"""
metadata = {} metadata = {}
zipin = zipfile.ZipFile(self.filename) zipin = zipfile.ZipFile(self.filename)
for item in zipin.namelist(): for item in zipin.namelist():
if item.startswith('docProps/'): if item.startswith('docProps/') and item.endswith('.xml'):
metadata[item] = 'harmful content' content = zipin.read(item).decode('utf-8')
for (key, value) in re.findall(r"<(.+)>(.+)</\1>", content, re.I):
metadata[key] = value
if not metadata: # better safe than sorry
metadata[item] = 'harmful content'
zipin.close() zipin.close()
return metadata return metadata
......
...@@ -42,7 +42,9 @@ class TestGetMeta(unittest.TestCase): ...@@ -42,7 +42,9 @@ class TestGetMeta(unittest.TestCase):
def test_docx(self): def test_docx(self):
p = office.OfficeParser('./tests/data/dirty.docx') p = office.OfficeParser('./tests/data/dirty.docx')
meta = p.get_meta() meta = p.get_meta()
print(meta) self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin')
self.assertEqual(meta['dc:creator'], 'julien voisin')
self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1')
class TestCleaning(unittest.TestCase): class TestCleaning(unittest.TestCase):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment