Commit 7dad77a7 authored by jvoisin's avatar jvoisin
Browse files

Make the parsing of office format's metadata more robust

parent 8c7979aa
......@@ -78,8 +78,12 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
for item in zipin.infolist():
if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
content = zipin.read(item).decode('utf-8')
for (key, value) in re.findall(r"<(.+)>(.+)</\1>", content, re.I):
metadata[key] = value
try:
results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M)
for (key, value) in results:
metadata[key] = value
except TypeError: # We didn't manage to parse the xml file
pass
if not metadata: # better safe than sorry
metadata[item] = 'harmful content'
......@@ -140,8 +144,12 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
for item in zipin.infolist():
if item.filename == 'meta.xml':
content = zipin.read(item).decode('utf-8')
for (key, value) in re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I):
metadata[key] = value
try:
results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M)
for (key, value) in results:
metadata[key] = value
except TypeError: # We didn't manage to parse the xml file
pass
if not metadata: # better safe than sorry
metadata[item] = 'harmful content'
for key, value in self._get_zipinfo_meta(item).items():
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment