Skip to content
Snippets Groups Projects
Commit 7dad77a7 authored by Julien (jvoisin) Voisin
Browse files

Make the parsing of office format's metadata more robust

parent 8c7979aa
No related branches found
No related tags found
No related merge requests found
......@@ -78,8 +78,12 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
for item in zipin.infolist():
if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
content = zipin.read(item).decode('utf-8')
for (key, value) in re.findall(r"<(.+)>(.+)</\1>", content, re.I):
metadata[key] = value
try:
results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M)
for (key, value) in results:
metadata[key] = value
except TypeError: # We didn't manage to parse the xml file
pass
if not metadata: # better safe than sorry
metadata[item] = 'harmful content'
......@@ -140,8 +144,12 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
for item in zipin.infolist():
if item.filename == 'meta.xml':
content = zipin.read(item).decode('utf-8')
for (key, value) in re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I):
metadata[key] = value
try:
results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M)
for (key, value) in results:
metadata[key] = value
except TypeError: # We didn't manage to parse the xml file
pass
if not metadata: # better safe than sorry
metadata[item] = 'harmful content'
for key, value in self._get_zipinfo_meta(item).items():
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment