diff --git a/libmat2/office.py b/libmat2/office.py
index ee6cf916954c94dc279c3d95789cd11e1314d2bc..db4b3e3db35bd92d21d4f82ad51a27dbe55f94fd 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -300,18 +300,14 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
         Yes, I know that parsing xml with regexp ain't pretty,
         be my guest and fix it if you want.
         """
-        if not file_path.startswith('docProps/'):
-            return {}
-        elif not file_path.endswith('.xml'):
+        # Skip anything that is not an .xml file located under docProps/.
+        if not file_path.startswith('docProps/') or not file_path.endswith('.xml'):
             return {}
 
         with open(full_path, encoding='utf-8') as f:
             try:
                 results = re.findall(r"<(.+)>(.+)</\1>", f.read(), re.I|re.M)
-                metadata = {}
-                for (key, value) in results:
-                    metadata[key] = value
-                return metadata
+                return dict(results)
             except (TypeError, UnicodeDecodeError):
                 # We didn't manage to parse the xml file
                 return {file_path: 'harmful content', }