diff --git a/README.md b/README.md index 5f902feaf056214ea971b21637f16fbf813b48b1..c81daffb50ba23fa5891509c22d9e4e00d60b45b 100644 --- a/README.md +++ b/README.md @@ -152,6 +152,8 @@ Copyright 2016 Marie-Rose for mat2's logo The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3, and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx +The `tests/data/narrated_powerpoint_presentation.pptx` file is in the public domain. + # Thanks mat2 wouldn't exist without: diff --git a/libmat2/office.py b/libmat2/office.py index 369ae9e8a1fe3461cca4a268e64f03f943ed64fc..2da37cdab9b64256ec06aaa3a1afddb71d6e3757 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -160,7 +160,7 @@ class MSOfficeParser(ZipParser): """ try: tree, namespace = _parse_xml(full_path) - except ET.ParseError as e: + except ET.ParseError as e: # pragma: no cover logging.error("Unable to parse %s: %s", full_path, e) return False @@ -220,7 +220,7 @@ class MSOfficeParser(ZipParser): def __remove_revisions(full_path: str) -> bool: try: tree, namespace = _parse_xml(full_path) - except ET.ParseError as e: + except ET.ParseError as e: # pragma: no cover logging.error("Unable to parse %s: %s", full_path, e) return False @@ -299,7 +299,7 @@ class MSOfficeParser(ZipParser): """ MSOffice documents are using various counters for cross-references, we collect them all, to make sure that they're effectively counters, and not unique id used for fingerprinting.""" - with open(full_path) as f: + with open(full_path, encoding='utf-8') as f: content = f.read() # relationship id for i in re.findall(r'(?:\s|r:)[iIdD]="rId([0-9]+)"(?:\s|/)', content): diff --git a/tests/data/narrated_powerpoint_presentation.pptx b/tests/data/narrated_powerpoint_presentation.pptx new file mode 100644 index 0000000000000000000000000000000000000000..ef041324b47a69ab5d3315687e57e21d789e4875 Binary files /dev/null and b/tests/data/narrated_powerpoint_presentation.pptx differ diff --git 
a/tests/test_libmat2.py b/tests/test_libmat2.py index 9e208ecd1693b5e589ca890c8b467623be7a2a51..30552daa7a719104923a7a9c93ab19c03763dc04 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -777,3 +777,16 @@ class TestNoSandbox(unittest.TestCase): os.remove('./tests/data/clean.png') os.remove('./tests/data/clean.cleaned.png') os.remove('./tests/data/clean.cleaned.cleaned.png') + +class TestComplexOfficeFiles(unittest.TestCase): + def test_complex_pptx(self): + target = './tests/data/clean.pptx' + shutil.copy('./tests/data/narrated_powerpoint_presentation.pptx', target) + p = office.MSOfficeParser(target) + self.assertTrue(p.remove_all()) + + os.remove(target) + os.remove(p.output_filename) + + +