Skip to content
Snippets Groups Projects
Commit c186fc42 authored by Julien (jvoisin) Voisin
Browse files

Clean deep metadata for zip files

parent 6d506b87
No related branches found
No related tags found
No related merge requests found
......@@ -34,6 +34,13 @@ class LibreOfficeParser(abstract.AbstractParser):
zipin.close()
return metadata
def __clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo:
    """Overwrite the metadata-bearing header fields of a zip entry.

    The compression method, originating system, per-entry comment and
    timestamp are all replaced with fixed values.

    :param zipinfo: the archive entry to sanitise; it is modified in place.
    :return: the very same ``zipinfo`` object, for call-chaining convenience.
    """
    neutral_fields = (
        ('date_time', (1980, 1, 1, 0, 0, 0)),
        ('comment', b''),
        ('create_system', 3),  # Linux
        ('compress_type', zipfile.ZIP_DEFLATED),
    )
    # The assignments are independent of one another, so order is irrelevant.
    for field_name, neutral_value in neutral_fields:
        setattr(zipinfo, field_name, neutral_value)
    return zipinfo
def remove_all(self):
zin = zipfile.ZipFile(self.filename, 'r')
zout = zipfile.ZipFile(self.output_filename, 'w')
......@@ -51,7 +58,10 @@ class LibreOfficeParser(abstract.AbstractParser):
print("%s isn't supported" % item.filename)
continue
tmp_parser.remove_all()
zout.write(tmp_parser.output_filename, item.filename)
zinfo = zipfile.ZipInfo(item.filename)
item = self.__clean_zipinfo(item)
with open(tmp_parser.output_filename, 'rb') as f:
zout.writestr(zinfo, f.read())
shutil.rmtree(temp_folder)
zout.close()
zin.close()
......
......@@ -33,6 +33,13 @@ class OfficeParser(abstract.AbstractParser):
zipin.close()
return metadata
def __clean_zipinfo(self, zipinfo:zipfile.ZipInfo) -> zipfile.ZipInfo:
    """Reset the header fields of a zip entry to fixed, neutral values.

    Touches the compression method, the originating-system code, the
    entry comment and the modification timestamp; nothing else.

    :param zipinfo: entry whose header is rewritten (mutated in place).
    :return: the same ``zipinfo`` instance that was passed in.
    """
    replacements = {
        'compress_type': zipfile.ZIP_DEFLATED,
        'create_system': 3,  # Linux
        'comment': b'',
        'date_time': (1980, 1, 1, 0, 0, 0),
    }
    for attribute in replacements:
        setattr(zipinfo, attribute, replacements[attribute])
    return zipinfo
def remove_all(self):
zin = zipfile.ZipFile(self.filename, 'r')
zout = zipfile.ZipFile(self.output_filename, 'w')
......@@ -45,6 +52,7 @@ class OfficeParser(abstract.AbstractParser):
if not item.filename.endswith('.rels'):
continue # don't keep metadata files
if item.filename in self.files_to_keep:
item = self.__clean_zipinfo(item)
zout.writestr(item, zin.read(item))
continue
......@@ -54,7 +62,11 @@ class OfficeParser(abstract.AbstractParser):
print("%s isn't supported" % item.filename)
continue
tmp_parser.remove_all()
zout.write(tmp_parser.output_filename, item.filename)
zinfo = zipfile.ZipInfo(item.filename)
item = self.__clean_zipinfo(item)
with open(tmp_parser.output_filename, 'rb') as f:
zout.writestr(zinfo, f.read())
shutil.rmtree(temp_folder)
zout.close()
zin.close()
......
......@@ -57,7 +57,7 @@ class TestGetMeta(unittest.TestCase):
class TestDeepCleaning(unittest.TestCase):
def __check_zip_clean(self, p):
def __check_deep_meta(self, p):
tempdir = tempfile.mkdtemp()
zipin = zipfile.ZipFile(p.filename)
zipin.extractall(tempdir)
......@@ -72,6 +72,15 @@ class TestDeepCleaning(unittest.TestCase):
self.assertEqual(inside_p.get_meta(), {})
shutil.rmtree(tempdir)
def __check_zip_meta(self, p):
    """Assert that every entry of the archive at ``p.filename`` has neutral zip headers.

    For each entry the comment must be empty, the timestamp must be
    ``(1980, 1, 1, 0, 0, 0)`` and the originating-system code must be 3.

    :param p: object exposing a ``filename`` attribute that points to a zip archive.
    :raises AssertionError: via ``self.assertEqual`` on the first mismatching field.
    """
    # Open through a context manager so the archive handle is released even
    # when an assertion fails part-way through the loop.
    with zipfile.ZipFile(p.filename) as zipin:
        for item in zipin.infolist():
            self.assertEqual(item.comment, b'')
            self.assertEqual(item.date_time, (1980, 1, 1, 0, 0, 0))
            self.assertEqual(item.create_system, 3)  # 3 is UNIX
def test_office(self):
shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx')
p = office.OfficeParser('./tests/data/clean.docx')
......@@ -85,7 +94,8 @@ class TestDeepCleaning(unittest.TestCase):
p = office.OfficeParser('./tests/data/clean.docx.cleaned')
self.assertEqual(p.get_meta(), {})
self.__check_zip_clean(p)
self.__check_zip_meta(p)
self.__check_deep_meta(p)
os.remove('./tests/data/clean.docx')
......@@ -103,7 +113,8 @@ class TestDeepCleaning(unittest.TestCase):
p = libreoffice.LibreOfficeParser('./tests/data/clean.odt.cleaned')
self.assertEqual(p.get_meta(), {})
self.__check_zip_clean(p)
self.__check_zip_meta(p)
self.__check_deep_meta(p)
os.remove('./tests/data/clean.odt')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment