Skip to content
Snippets Groups Projects
Commit ad3e7cce authored by Julien (jvoisin) Voisin
Browse files

Bump coverage for office files and fix some related crashes

parent ca014841
No related branches found
No related tags found
No related merge requests found
...@@ -147,7 +147,10 @@ class MSOfficeParser(ArchiveBasedAbstractParser): ...@@ -147,7 +147,10 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
""" In this function, we're changing the XML """ In this function, we're changing the XML
document in two times, since we don't want document in two times, since we don't want
to change the tree we're iterating on.""" to change the tree we're iterating on."""
tree, ns = _parse_xml(full_path) try:
tree, ns = _parse_xml(full_path)
except ET.ParseError:
return False
# No revisions are present # No revisions are present
del_presence = tree.find('.//w:del', ns) del_presence = tree.find('.//w:del', ns)
...@@ -191,15 +194,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser): ...@@ -191,15 +194,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
zipin = zipfile.ZipFile(self.filename) zipin = zipfile.ZipFile(self.filename)
for item in zipin.infolist(): for item in zipin.infolist():
if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
content = zipin.read(item).decode('utf-8')
try: try:
content = zipin.read(item).decode('utf-8')
results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M) results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M)
for (key, value) in results: for (key, value) in results:
metadata[key] = value metadata[key] = value
except TypeError: # We didn't manage to parse the xml file except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file
pass metadata[item.filename] = 'harmful content'
if not metadata: # better safe than sorry
metadata[item] = 'harmful content'
for key, value in self._get_zipinfo_meta(item).items(): for key, value in self._get_zipinfo_meta(item).items():
metadata[key] = value metadata[key] = value
zipin.close() zipin.close()
...@@ -232,7 +233,10 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): ...@@ -232,7 +233,10 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
def __remove_revisions(self, full_path: str) -> bool: def __remove_revisions(self, full_path: str) -> bool:
tree, ns = _parse_xml(full_path) try:
tree, ns = _parse_xml(full_path)
except ET.ParseError:
return False
if 'office' not in ns.keys(): # no revisions in the current file if 'office' not in ns.keys(): # no revisions in the current file
return True return True
...@@ -259,15 +263,13 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): ...@@ -259,15 +263,13 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
zipin = zipfile.ZipFile(self.filename) zipin = zipfile.ZipFile(self.filename)
for item in zipin.infolist(): for item in zipin.infolist():
if item.filename == 'meta.xml': if item.filename == 'meta.xml':
content = zipin.read(item).decode('utf-8')
try: try:
content = zipin.read(item).decode('utf-8')
results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M) results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M)
for (key, value) in results: for (key, value) in results:
metadata[key] = value metadata[key] = value
except TypeError: # We didn't manage to parse the xml file except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file
pass metadata[item.filename] = 'harmful content'
if not metadata: # better safe than sorry
metadata[item] = 'harmful content'
for key, value in self._get_zipinfo_meta(item).items(): for key, value in self._get_zipinfo_meta(item).items():
metadata[key] = value metadata[key] = value
zipin.close() zipin.close()
......
File added
File added
...@@ -15,6 +15,21 @@ class TestUnsupportedFiles(unittest.TestCase): ...@@ -15,6 +15,21 @@ class TestUnsupportedFiles(unittest.TestCase):
self.assertEqual(parser, None) self.assertEqual(parser, None)
os.remove('./tests/clean.py') os.remove('./tests/clean.py')
class TestCorruptedEmbedded(unittest.TestCase):
    """Exercise the parsers on archives that embed a corrupted member.

    Every test works on a throwaway copy of a fixture from ``tests/data``.
    The copy is registered for removal with ``addCleanup`` immediately after
    it is created, so it is deleted even when one of the assertions fails
    and never leaks into the fixture directory.
    """

    def test_docx(self):
        """``remove_all()`` reports failure, yet ``get_meta()`` still returns something."""
        target = './tests/data/clean.docx'
        shutil.copy('./tests/data/embedded_corrupted.docx', target)
        # Register the cleanup before asserting anything: a trailing
        # os.remove() would be skipped on the first failing assertion.
        self.addCleanup(os.remove, target)

        parser, _ = parser_factory.get_parser(target)
        self.assertFalse(parser.remove_all())
        self.assertIsNotNone(parser.get_meta())

    def test_odt(self):
        """``remove_all()`` reports failure and ``meta.xml`` is flagged as harmful."""
        target = './tests/data/clean.odt'
        shutil.copy('./tests/data/embedded_corrupted.odt', target)
        # Same reasoning as in test_docx: guarantee removal of the copy.
        self.addCleanup(os.remove, target)

        parser, _ = parser_factory.get_parser(target)
        self.assertFalse(parser.remove_all())
        expected_meta = {
            'create_system': 'Weird',
            'date_time': '2018-06-10 17:18:18',
            'meta.xml': 'harmful content',
        }
        self.assertEqual(parser.get_meta(), expected_meta)
class TestExplicitelyUnsupportedFiles(unittest.TestCase): class TestExplicitelyUnsupportedFiles(unittest.TestCase):
def test_pdf(self): def test_pdf(self):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment