From c186fc42929b2660e5c507adeb8a8fb406593b11 Mon Sep 17 00:00:00 2001
From: jvoisin <julien.voisin@dustri.org>
Date: Sun, 1 Apr 2018 00:17:06 +0200
Subject: [PATCH] Clean deep metadata for zip files

---
 src/libreoffice.py    | 12 +++++++++++-
 src/office.py         | 14 +++++++++++++-
 tests/test_libmat2.py | 17 ++++++++++++++---
 3 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/src/libreoffice.py b/src/libreoffice.py
index a3481a1..809ae3c 100644
--- a/src/libreoffice.py
+++ b/src/libreoffice.py
@@ -34,6 +34,13 @@ class LibreOfficeParser(abstract.AbstractParser):
         zipin.close()
         return metadata
 
+    def __clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo:  # mutates and returns `zipinfo`
+        zipinfo.compress_type = zipfile.ZIP_DEFLATED  # method used when this entry is written
+        zipinfo.create_system = 3  # 3 = Unix in the zip "version made by" field
+        zipinfo.comment = b''  # drop the per-entry comment
+        zipinfo.date_time = (1980, 1, 1, 0, 0, 0)  # earliest timestamp zip can store
+        return zipinfo
+
     def remove_all(self):
         zin = zipfile.ZipFile(self.filename, 'r')
         zout = zipfile.ZipFile(self.output_filename, 'w')
@@ -51,7 +58,10 @@ class LibreOfficeParser(abstract.AbstractParser):
                 print("%s isn't supported" % item.filename)
                 continue
             tmp_parser.remove_all()
-            zout.write(tmp_parser.output_filename, item.filename)
+            zinfo = zipfile.ZipInfo(item.filename)
+            zinfo = self.__clean_zipinfo(zinfo)  # clean the entry that is actually written
+            with open(tmp_parser.output_filename, 'rb') as f:
+                zout.writestr(zinfo, f.read())
         shutil.rmtree(temp_folder)
         zout.close()
         zin.close()
diff --git a/src/office.py b/src/office.py
index 5de0597..a729f2f 100644
--- a/src/office.py
+++ b/src/office.py
@@ -33,6 +33,13 @@ class OfficeParser(abstract.AbstractParser):
         zipin.close()
         return metadata
 
+    def __clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo:  # mutates and returns `zipinfo`
+        zipinfo.compress_type = zipfile.ZIP_DEFLATED  # method used when this entry is written
+        zipinfo.create_system = 3  # 3 = Unix in the zip "version made by" field
+        zipinfo.comment = b''  # drop the per-entry comment
+        zipinfo.date_time = (1980, 1, 1, 0, 0, 0)  # earliest timestamp zip can store
+        return zipinfo
+
     def remove_all(self):
         zin = zipfile.ZipFile(self.filename, 'r')
         zout = zipfile.ZipFile(self.output_filename, 'w')
@@ -45,6 +52,7 @@ class OfficeParser(abstract.AbstractParser):
                 if not item.filename.endswith('.rels'):
                     continue  # don't keep metadata files
             if item.filename in self.files_to_keep:
-                zout.writestr(item, zin.read(item))
+                data = zin.read(item)  # read before cleaning: it rewrites compress_type
+                zout.writestr(self.__clean_zipinfo(item), data)
                 continue
 
@@ -54,7 +62,11 @@ class OfficeParser(abstract.AbstractParser):
                 print("%s isn't supported" % item.filename)
                 continue
             tmp_parser.remove_all()
-            zout.write(tmp_parser.output_filename, item.filename)
+            zinfo = zipfile.ZipInfo(item.filename)
+            zinfo = self.__clean_zipinfo(zinfo)  # clean the entry that is actually written
+            with open(tmp_parser.output_filename, 'rb') as f:
+                zout.writestr(zinfo, f.read())
+
         shutil.rmtree(temp_folder)
         zout.close()
         zin.close()
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index c065237..888c782 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -57,7 +57,7 @@ class TestGetMeta(unittest.TestCase):
 
 
 class TestDeepCleaning(unittest.TestCase):
-    def __check_zip_clean(self, p):
+    def __check_deep_meta(self, p):
         tempdir = tempfile.mkdtemp()
         zipin = zipfile.ZipFile(p.filename)
         zipin.extractall(tempdir)
@@ -72,6 +72,15 @@ class TestDeepCleaning(unittest.TestCase):
                 self.assertEqual(inside_p.get_meta(), {})
         shutil.rmtree(tempdir)
 
+
+    def __check_zip_meta(self, p):
+        """Assert every entry of the zip at p.filename has neutral metadata."""
+        with zipfile.ZipFile(p.filename) as zipin:  # closed even if an assert fails
+            for item in zipin.infolist():
+                self.assertEqual(item.comment, b'')
+                self.assertEqual(item.date_time, (1980, 1, 1, 0, 0, 0))
+                self.assertEqual(item.create_system, 3)  # 3 is UNIX
+
     def test_office(self):
         shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx')
         p = office.OfficeParser('./tests/data/clean.docx')
@@ -85,7 +94,8 @@ class TestDeepCleaning(unittest.TestCase):
         p = office.OfficeParser('./tests/data/clean.docx.cleaned')
         self.assertEqual(p.get_meta(), {})
 
-        self.__check_zip_clean(p)
+        self.__check_zip_meta(p)
+        self.__check_deep_meta(p)
 
         os.remove('./tests/data/clean.docx')
 
@@ -103,7 +113,8 @@ class TestDeepCleaning(unittest.TestCase):
         p = libreoffice.LibreOfficeParser('./tests/data/clean.odt.cleaned')
         self.assertEqual(p.get_meta(), {})
 
-        self.__check_zip_clean(p)
+        self.__check_zip_meta(p)
+        self.__check_deep_meta(p)
 
         os.remove('./tests/data/clean.odt')
 
-- 
GitLab