From 7dad77a7857990787551e99b2a09bdf908e67553 Mon Sep 17 00:00:00 2001
From: jvoisin <julien.voisin@dustri.org>
Date: Sun, 10 Jun 2018 20:20:00 +0200
Subject: [PATCH] Make the parsing of office formats' metadata more robust

---
 libmat2/office.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/libmat2/office.py b/libmat2/office.py
index 914fd39..6ab7e80 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -78,8 +78,12 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
         for item in zipin.infolist():
             if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
                 content = zipin.read(item).decode('utf-8')
-                for (key, value) in re.findall(r"<(.+)>(.+)</\1>", content, re.I):
-                    metadata[key] = value
+                try:
+                    results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M)
+                    for (key, value) in results:
+                        metadata[key] = value
+                except TypeError:  # We didn't manage to parse the xml file
+                    pass
                 if not metadata:  # better safe than sorry
                     metadata[item] = 'harmful content'
 
@@ -140,8 +144,12 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
         for item in zipin.infolist():
             if item.filename == 'meta.xml':
                 content = zipin.read(item).decode('utf-8')
-                for (key, value) in re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I):
-                    metadata[key] = value
+                try:
+                    results = re.findall(r"<((?:meta|dc|cp).+?)>(.+)</\1>", content, re.I|re.M)
+                    for (key, value) in results:
+                        metadata[key] = value
+                except TypeError:  # We didn't manage to parse the xml file
+                    pass
                 if not metadata:  # better safe than sorry
                     metadata[item] = 'harmful content'
             for key, value in self._get_zipinfo_meta(item).items():
-- 
GitLab