diff --git a/libmat2/archive.py b/libmat2/archive.py
index f788ecc7f81b4e21289e56383d617ce4026ce68f..80e0bf2c63e4e55437e6fc79909605a4b10f227e 100644
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
@@ -67,6 +67,41 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
 
         return metadata
 
+    def get_meta(self) -> Dict[str, Union[str, dict]]:
+        """Collect the metadata of every file stored in the archive.
+
+        Each member is extracted to a temporary folder and handed to the
+        parser that `parser_factory.get_parser` returns for it; members
+        without a parser, or whose parser reports nothing, are left out.
+
+        :return: a dict mapping each member's name to whatever its
+            parser's `get_meta` returned.
+        """
+        meta = dict()  # type: Dict[str, Union[str, dict]]
+
+        # Both context managers clean up even if a nested parser raises,
+        # so the extracted files never outlive this call.
+        with zipfile.ZipFile(self.filename) as zin, \
+                tempfile.TemporaryDirectory() as temp_folder:
+            for item in zin.infolist():
+                if item.filename.endswith('/'):  # pragma: no cover
+                    # `is_dir` is added in Python3.6
+                    continue  # don't keep empty folders
+
+                # `extract` sanitises member names (absolute paths, `..`),
+                # so use the path it returns instead of rebuilding it.
+                full_path = zin.extract(member=item, path=temp_folder)
+
+                tmp_parser, _ = parser_factory.get_parser(full_path)  # type: ignore
+                if not tmp_parser:
+                    continue
+
+                local_meta = tmp_parser.get_meta()
+                if local_meta:
+                    meta[item.filename] = local_meta
+
+        return meta
+
     def remove_all(self) -> bool:
         # pylint: disable=too-many-branches
 
diff --git a/libmat2/office.py b/libmat2/office.py
index c10664f9c6472b6caf048edd46fab157a13cd3fb..e6370e7ecbe69462ed9ecc3cb28a9c868b4e6ba5 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -301,7 +301,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
         Yes, I know that parsing xml with regexp ain't pretty,
         be my guest and fix it if you want.
         """
-        metadata = {}
+        metadata = super().get_meta()
         zipin = zipfile.ZipFile(self.filename)
         for item in zipin.infolist():
             if item.filename.startswith('docProps/') and item.filename.endswith('.xml'):
diff --git a/tests/test_deep_cleaning.py b/tests/test_deep_cleaning.py
index 03db6c5befeadda45aa43a167247ace4ad451289..846612746acec6dfdebc4f9f5c97f32851d4fd9c 100644
--- a/tests/test_deep_cleaning.py
+++ b/tests/test_deep_cleaning.py
@@ -36,6 +36,7 @@ class TestZipMetadata(unittest.TestCase):
 
         meta = p.get_meta()
         self.assertIsNotNone(meta)
+        self.assertEqual(meta['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
 
         ret = p.remove_all()
         self.assertTrue(ret)