diff --git a/libmat2/archive.py b/libmat2/archive.py index 016142d74db1b5dbc7a3206cb00f5a8c9c4b15e8..205312b8863f95bfe5fdd98dd29af29c01450dff 100644 --- a/libmat2/archive.py +++ b/libmat2/archive.py @@ -2,6 +2,7 @@ import zipfile import datetime import tempfile import os +import sys import logging import shutil from typing import Dict, Set, Pattern @@ -49,10 +50,10 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): return zipinfo @staticmethod - def _get_zipinfo_meta(zipinfo: zipfile.ZipInfo) -> Dict[str, str]: + def _get_zipinfo_meta(self, zipinfo: zipfile.ZipInfo) -> Dict[str, str]: metadata = {} if zipinfo.create_system == 3: # this is Linux - pass + metadata['create_system'] = 'Linux' elif zipinfo.create_system == 2: metadata['create_system'] = 'Windows' else: @@ -64,11 +65,15 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): if zipinfo.date_time != (1980, 1, 1, 0, 0, 0): metadata['date_time'] = str(datetime.datetime(*zipinfo.date_time)) + ret = self._parse_files() + metadata_files = ret[0] + for name, _ in metadata_files.items(): + metadata[name] = metadata_files return metadata - def remove_all(self) -> bool: - # pylint: disable=too-many-branches - + def _parse_files(self) -> tuple: + metadata = {} # type: dict + caller = sys._getframe(1).f_code.co_name with zipfile.ZipFile(self.filename) as zin,\ zipfile.ZipFile(self.output_filename, 'w') as zout: @@ -84,11 +89,12 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): zin.extract(member=item, path=temp_folder) full_path = os.path.join(temp_folder, item.filename) - if self._specific_cleanup(full_path) is False: - logging.warning("Something went wrong during deep cleaning of %s", - item.filename) - abort = True - continue + if caller == "remove_all": + if self._specific_cleanup(full_path) is False: + logging.warning("Something went wrong during deep cleaning of %s", + item.filename) + abort = True + continue if any(map(lambda r: r.search(item.filename), self.files_to_keep)): # those files aren't supported, but we want to add them anyway @@ -112,8 +118,11 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): abort = True continue if tmp_parser: - tmp_parser.remove_all() - os.rename(tmp_parser.output_filename, full_path) + if caller == "remove_all": + tmp_parser.remove_all() + os.rename(tmp_parser.output_filename, full_path) + else: + metadata[item.filename] = tmp_parser.get_meta() zinfo = zipfile.ZipInfo(item.filename) # type: ignore clean_zinfo = self._clean_zipinfo(zinfo) @@ -123,5 +132,21 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): shutil.rmtree(temp_folder) if abort: os.remove(self.output_filename) - return False - return True + return metadata, abort + # pylint: disable=too-many-branches + + def remove_all(self) -> bool: + ret = self._parse_files() + return not ret[1] + +class ZIPParser(ArchiveBasedAbstractParser): + mimetypes = {'application/zip'} + + def get_meta(self) -> Dict[str, str]: + metadata = {} + zipin = zipfile.ZipFile(self.filename) + for item in zipin.infolist(): + for key, value in self._get_zipinfo_meta(self, item).items(): + metadata[key] = value + zipin.close() + return metadata diff --git a/libmat2/office.py b/libmat2/office.py index 32e7b7576a4f985b021410963860306b78820a55..73d5d9ac95a0b870b397addaf80ac92f9d056389 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -312,7 +312,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser): metadata[key] = value except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file metadata[item.filename] = 'harmful content' - for key, value in self._get_zipinfo_meta(item).items(): + for key, value in self._get_zipinfo_meta(self, item).items(): metadata[key] = value zipin.close() return metadata @@ -397,7 +397,7 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): metadata[key] = value except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file metadata[item.filename] = 'harmful content' - for key, value in self._get_zipinfo_meta(item).items(): + for key, value in self._get_zipinfo_meta(self, item).items(): metadata[key] = value zipin.close() return metadata diff --git a/mat2 b/mat2 index 987e439e974122c1eb16c9b2009dfe2691441ae5..9a63c6d6d1d7f199ee37865129d5cdf42a6939f7 100755 --- a/mat2 +++ b/mat2 @@ -73,10 +73,22 @@ def show_meta(filename: str): return for k, v in metadata: + zipmeta = v try: # FIXME this is ugly. - print(" %s: %s" % (k, v)) + if not isinstance(zipmeta, dict): + print(" %s: %s" % (k, v)) except UnicodeEncodeError: print(" %s: harmful content" % k) + if mtype == "application/zip": + print("[+] Metadata for files inside the archive :") + if isinstance(zipmeta, dict): + for name, metas in zipmeta.items(): + try: # FIXME this is ugly. + print(" %s" % name) + for meta_name, meta in metas.items(): + print(" %s: %s" % (meta_name, meta)) + except UnicodeEncodeError: + print(" %s: harmful content" % k) def clean_meta(filename: str, is_lightweight: bool, policy: UnknownMemberPolicy) -> bool: if not __check_file(filename, os.R_OK|os.W_OK): diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index 8d7c252529e6b905cc49706f81c14a49ccc2f0d5..f175470cb065cbba9fea65aa8f864850527d6ddf 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py @@ -65,7 +65,10 @@ class TestCorruptedEmbedded(unittest.TestCase): os.remove('./tests/data/clean.docx') def test_odt(self): - expected = { + expected = {'Pictures/100002010000021D0000039CFEBF39BEE21A25FB.png': + {'PixelUnits': 'meters', + 'PixelsPerUnitX': 341, + 'PixelsPerUnitY': 341}, 'create_system': 'Weird', 'date_time': '2018-06-10 17:18:18', 'meta.xml': 'harmful content' diff --git a/tests/test_deep_cleaning.py b/tests/test_deep_cleaning.py index 03db6c5befeadda45aa43a167247ace4ad451289..88415ffb83bc137ef5a0df4a2af91f907a83a687 100644 --- a/tests/test_deep_cleaning.py +++ b/tests/test_deep_cleaning.py @@ -41,7 +41,7 @@ class TestZipMetadata(unittest.TestCase): self.assertTrue(ret) p = office.MSOfficeParser('./tests/data/clean.cleaned.docx') - self.assertEqual(p.get_meta(), {}) + self.assertEqual(p.get_meta(), {'create_system': 'Linux', 'word/media/image1.png': {'word/media/image1.png': {}}}) self.__check_zip_meta(p) self.__check_deep_meta(p) @@ -60,7 +60,7 @@ class TestZipMetadata(unittest.TestCase): self.assertTrue(ret) p = office.LibreOfficeParser('./tests/data/clean.cleaned.odt') - self.assertEqual(p.get_meta(), {}) + self.assertEqual(p.get_meta(), {'Pictures/1000000000000032000000311EC5314D.png': {'Pictures/1000000000000032000000311EC5314D.png': {}}, 'create_system': 'Linux'}) self.__check_zip_meta(p) self.__check_deep_meta(p) diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 957503d12a09dccb7fdb75995da9b699b3601dd4..17f709313760bed14371b0b75c99dd3147f8a4fe 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -336,13 +336,30 @@ class TestCleaning(unittest.TestCase): self.assertTrue(ret) p = office.MSOfficeParser('./tests/data/clean.cleaned.docx') - self.assertEqual(p.get_meta(), {}) - self.assertTrue(p.remove_all()) + self.assertEqual(p.get_meta(), {'create_system': 'Linux', 'word/media/image1.png': {'word/media/image1.png': {}}}) os.remove('./tests/data/clean.docx') os.remove('./tests/data/clean.cleaned.docx') os.remove('./tests/data/clean.cleaned.cleaned.docx') + def test_zip(self): + shutil.copy('./tests/data/test.zip', './tests/data/clean.zip') + p = office.MSOfficeParser('./tests/data/clean.zip') + + meta = p.get_meta() + self.assertIsNotNone(meta) + + ret = p.remove_all() + self.assertTrue(ret) + + p = office.MSOfficeParser('./tests/data/clean.cleaned.zip') + self.assertEqual(p.get_meta(), {'create_system': 'Linux', + 'dirty.mp3': {'dirty.mp3': {}, 'dirty.png': {}}, + 'dirty.png': {'dirty.mp3': {}, 'dirty.png': {}}}) + + os.remove('./tests/data/clean.zip') + os.remove('./tests/data/clean.cleaned.zip') + def test_libreoffice(self): shutil.copy('./tests/data/dirty.odt', './tests/data/clean.odt') p = office.LibreOfficeParser('./tests/data/clean.odt') @@ -354,7 +371,7 @@ class TestCleaning(unittest.TestCase): self.assertTrue(ret) p = office.LibreOfficeParser('./tests/data/clean.cleaned.odt') - self.assertEqual(p.get_meta(), {}) + self.assertEqual(p.get_meta(), {'Pictures/1000000000000032000000311EC5314D.png': {'Pictures/1000000000000032000000311EC5314D.png': {}}, 'create_system': 'Linux'}) self.assertTrue(p.remove_all()) os.remove('./tests/data/clean.odt') @@ -426,8 +443,8 @@ class TestCleaning(unittest.TestCase): self.assertTrue(ret) p = office.LibreOfficeParser('./tests/data/clean.cleaned.odf') - self.assertEqual(p.get_meta(), {}) self.assertTrue(p.remove_all()) + self.assertEqual(p.get_meta(), {'create_system': 'Linux'}) os.remove('./tests/data/clean.odf') os.remove('./tests/data/clean.cleaned.odf') @@ -444,8 +461,7 @@ class TestCleaning(unittest.TestCase): self.assertTrue(ret) p = office.LibreOfficeParser('./tests/data/clean.cleaned.odg') - self.assertEqual(p.get_meta(), {}) - self.assertTrue(p.remove_all()) + self.assertEqual(p.get_meta(), {'create_system': 'Linux'}) os.remove('./tests/data/clean.odg') os.remove('./tests/data/clean.cleaned.odg')