Skip to content
Snippets Groups Projects
Commit c186fc42 authored by Julien (jvoisin) Voisin's avatar Julien (jvoisin) Voisin
Browse files

Clean deep metadata for zip files

parent 6d506b87
No related branches found
No related tags found
No related merge requests found
...@@ -34,6 +34,13 @@ class LibreOfficeParser(abstract.AbstractParser): ...@@ -34,6 +34,13 @@ class LibreOfficeParser(abstract.AbstractParser):
zipin.close() zipin.close()
return metadata return metadata
def __clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
    """Overwrite the per-entry zip metadata of `zipinfo` with fixed values.

    The entry is modified in place and also returned, so the result can
    be re-bound by the caller (`item = self.__clean_zipinfo(item)`).

    :param zipinfo: description of the archive member to normalise.
    :returns: the same `zipinfo` object, with its compression method,
        creator system, comment, timestamp and extra field reset.
    """
    zipinfo.compress_type = zipfile.ZIP_DEFLATED
    zipinfo.create_system = 3  # Unix
    zipinfo.comment = b''
    # Earliest date a zip entry can carry.
    zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
    # The extra field can hold extended timestamps or uid/gid records;
    # left alone, it would be written unchanged into the output archive.
    zipinfo.extra = b''
    return zipinfo
def remove_all(self): def remove_all(self):
zin = zipfile.ZipFile(self.filename, 'r') zin = zipfile.ZipFile(self.filename, 'r')
zout = zipfile.ZipFile(self.output_filename, 'w') zout = zipfile.ZipFile(self.output_filename, 'w')
...@@ -51,7 +58,10 @@ class LibreOfficeParser(abstract.AbstractParser): ...@@ -51,7 +58,10 @@ class LibreOfficeParser(abstract.AbstractParser):
print("%s isn't supported" % item.filename) print("%s isn't supported" % item.filename)
continue continue
tmp_parser.remove_all() tmp_parser.remove_all()
zout.write(tmp_parser.output_filename, item.filename) zinfo = zipfile.ZipInfo(item.filename)
item = self.__clean_zipinfo(item)
with open(tmp_parser.output_filename, 'rb') as f:
zout.writestr(zinfo, f.read())
shutil.rmtree(temp_folder) shutil.rmtree(temp_folder)
zout.close() zout.close()
zin.close() zin.close()
......
...@@ -33,6 +33,13 @@ class OfficeParser(abstract.AbstractParser): ...@@ -33,6 +33,13 @@ class OfficeParser(abstract.AbstractParser):
zipin.close() zipin.close()
return metadata return metadata
def __clean_zipinfo(self, zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
    """Overwrite the per-entry zip metadata of `zipinfo` with fixed values.

    The entry is modified in place and also returned, so the result can
    be re-bound by the caller (`item = self.__clean_zipinfo(item)`).

    :param zipinfo: description of the archive member to normalise.
    :returns: the same `zipinfo` object, with its compression method,
        creator system, comment, timestamp and extra field reset.
    """
    zipinfo.compress_type = zipfile.ZIP_DEFLATED
    zipinfo.create_system = 3  # Unix
    zipinfo.comment = b''
    # Earliest date a zip entry can carry.
    zipinfo.date_time = (1980, 1, 1, 0, 0, 0)
    # The extra field can hold extended timestamps or uid/gid records;
    # left alone, it would be written unchanged into the output archive.
    zipinfo.extra = b''
    return zipinfo
def remove_all(self): def remove_all(self):
zin = zipfile.ZipFile(self.filename, 'r') zin = zipfile.ZipFile(self.filename, 'r')
zout = zipfile.ZipFile(self.output_filename, 'w') zout = zipfile.ZipFile(self.output_filename, 'w')
...@@ -45,6 +52,7 @@ class OfficeParser(abstract.AbstractParser): ...@@ -45,6 +52,7 @@ class OfficeParser(abstract.AbstractParser):
if not item.filename.endswith('.rels'): if not item.filename.endswith('.rels'):
continue # don't keep metadata files continue # don't keep metadata files
if item.filename in self.files_to_keep: if item.filename in self.files_to_keep:
item = self.__clean_zipinfo(item)
zout.writestr(item, zin.read(item)) zout.writestr(item, zin.read(item))
continue continue
...@@ -54,7 +62,11 @@ class OfficeParser(abstract.AbstractParser): ...@@ -54,7 +62,11 @@ class OfficeParser(abstract.AbstractParser):
print("%s isn't supported" % item.filename) print("%s isn't supported" % item.filename)
continue continue
tmp_parser.remove_all() tmp_parser.remove_all()
zout.write(tmp_parser.output_filename, item.filename) zinfo = zipfile.ZipInfo(item.filename)
item = self.__clean_zipinfo(item)
with open(tmp_parser.output_filename, 'rb') as f:
zout.writestr(zinfo, f.read())
shutil.rmtree(temp_folder) shutil.rmtree(temp_folder)
zout.close() zout.close()
zin.close() zin.close()
......
...@@ -57,7 +57,7 @@ class TestGetMeta(unittest.TestCase): ...@@ -57,7 +57,7 @@ class TestGetMeta(unittest.TestCase):
class TestDeepCleaning(unittest.TestCase): class TestDeepCleaning(unittest.TestCase):
def __check_zip_clean(self, p): def __check_deep_meta(self, p):
tempdir = tempfile.mkdtemp() tempdir = tempfile.mkdtemp()
zipin = zipfile.ZipFile(p.filename) zipin = zipfile.ZipFile(p.filename)
zipin.extractall(tempdir) zipin.extractall(tempdir)
...@@ -72,6 +72,15 @@ class TestDeepCleaning(unittest.TestCase): ...@@ -72,6 +72,15 @@ class TestDeepCleaning(unittest.TestCase):
self.assertEqual(inside_p.get_meta(), {}) self.assertEqual(inside_p.get_meta(), {})
shutil.rmtree(tempdir) shutil.rmtree(tempdir)
def __check_zip_meta(self, p):
    """Assert that every entry of the zip at `p.filename` has clean metadata.

    For each member of the archive, checks that the comment is empty,
    the timestamp is 1980-01-01 00:00:00 and the creator system is 3.

    :param p: object exposing the archive path as `p.filename`.
    """
    # Open through a context manager so the archive handle is released
    # even when one of the assertions fails.
    with zipfile.ZipFile(p.filename) as zipin:
        for item in zipin.infolist():
            self.assertEqual(item.comment, b'')
            self.assertEqual(item.date_time, (1980, 1, 1, 0, 0, 0))
            self.assertEqual(item.create_system, 3)  # 3 is UNIX
def test_office(self): def test_office(self):
shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx') shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx')
p = office.OfficeParser('./tests/data/clean.docx') p = office.OfficeParser('./tests/data/clean.docx')
...@@ -85,7 +94,8 @@ class TestDeepCleaning(unittest.TestCase): ...@@ -85,7 +94,8 @@ class TestDeepCleaning(unittest.TestCase):
p = office.OfficeParser('./tests/data/clean.docx.cleaned') p = office.OfficeParser('./tests/data/clean.docx.cleaned')
self.assertEqual(p.get_meta(), {}) self.assertEqual(p.get_meta(), {})
self.__check_zip_clean(p) self.__check_zip_meta(p)
self.__check_deep_meta(p)
os.remove('./tests/data/clean.docx') os.remove('./tests/data/clean.docx')
...@@ -103,7 +113,8 @@ class TestDeepCleaning(unittest.TestCase): ...@@ -103,7 +113,8 @@ class TestDeepCleaning(unittest.TestCase):
p = libreoffice.LibreOfficeParser('./tests/data/clean.odt.cleaned') p = libreoffice.LibreOfficeParser('./tests/data/clean.odt.cleaned')
self.assertEqual(p.get_meta(), {}) self.assertEqual(p.get_meta(), {})
self.__check_zip_clean(p) self.__check_zip_meta(p)
self.__check_deep_meta(p)
os.remove('./tests/data/clean.odt') os.remove('./tests/data/clean.odt')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment