diff --git a/libmat2/office.py b/libmat2/office.py
index bad352ba9f68af6283cf3b4a57654327d36ed370..b22009281d37817a263db2fe7390dcf081b58ac6 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -151,10 +151,57 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
 
         return True
 
+    def __remove_content_type_members(self, full_path: str) -> bool:
+        """ Remove the dangling references from the [Content_Types].xml
+        file, since MS Office doesn't like them.
+
+        Every `Override` entry whose `PartName` designates an archive member
+        matched by `self.files_to_omit` is dropped, then the file is
+        rewritten in place.
+
+        Return False if the file can't be parsed or is malformed,
+        True otherwise.
+        """
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError:  # pragma: no cover
+            return False
+
+        if len(namespace) != 1:
+            return False  # there should be only one namespace for Types
+        # Use the sole namespace whatever its prefix is, instead of
+        # assuming that it is declared as the default ('') one.
+        types_namespace = next(iter(namespace.values()))
+
+        with zipfile.ZipFile(self.filename) as zin:
+            removed_fnames = {
+                fname for fname in zin.namelist()
+                if any(r.search(fname) for r in self.files_to_omit)
+            }
+
+        root = tree.getroot()
+        # findall() returns a list, so removing while looping is safe
+        for item in root.findall('{%s}Override' % types_namespace):
+            part_name = item.attrib.get('PartName')
+            if part_name is None:
+                return False  # PartName is mandatory: the file is malformed
+            if part_name.lstrip('/') in removed_fnames:
+                root.remove(item)
+
+        tree.write(full_path, xml_declaration=True)
+
+        return True
+
     def _specific_cleanup(self, full_path: str) -> bool:
         if os.stat(full_path).st_size == 0:  # Don't process empty files
             return True
 
+        if full_path.endswith('/[Content_Types].xml'):
+            # this file contains references to files that we might
+            # remove, and MS Office doesn't like dangling references
+            if self.__remove_content_type_members(full_path) is False:
+                return False
+
         if full_path.endswith('/word/document.xml'):
             # this file contains the revisions
             if self.__remove_revisions(full_path) is False:
diff --git a/tests/data/malformed_content_types.docx b/tests/data/malformed_content_types.docx
new file mode 100644
index 0000000000000000000000000000000000000000..43ac7437618f8f49e52c2006526efa087cb0c011
Binary files /dev/null and b/tests/data/malformed_content_types.docx differ
diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py
index 30039e684d54a92d46622b70c181651306e14f94..5af0e811b2a9b513f339b0151e87b918bc3ee042 100644
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@@ -80,6 +80,17 @@ class TestExplicitelyUnsupportedFiles(unittest.TestCase):
         os.remove('./tests/data/clean.py')
 
 
+class TestCorruptedContentTypesOffice(unittest.TestCase):
+    def test_office(self):
+        # remove_all() is expected to fail on the malformed fixture
+        shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx')
+        # Remove the working copy even if one of the assertions fails.
+        self.addCleanup(os.remove, './tests/data/clean.docx')
+        p = office.MSOfficeParser('./tests/data/clean.docx')
+        self.assertIsNotNone(p)
+        self.assertFalse(p.remove_all())
+
+
 class TestCorruptedFiles(unittest.TestCase):
     def test_pdf(self):
         shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')