Skip to content
Snippets Groups Projects
Commit 177184ac authored by Julien (jvoisin) Voisin's avatar Julien (jvoisin) Voisin
Browse files

Massively simplify how we're cleaning office files

parent f44769df
No related branches found
No related tags found
No related merge requests found
...@@ -47,45 +47,38 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -47,45 +47,38 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
return metadata return metadata
def _clean_internal_file(self, item: zipfile.ZipInfo, temp_folder: str,
                         zin: zipfile.ZipFile, zout: zipfile.ZipFile) -> bool:
    """Clean a single archive member and append the result to `zout`.

    The member is extracted from `zin` into `temp_folder`, handed to
    whatever parser `parser_factory.get_parser` returns for the extracted
    path, and the parser's output file is then written into `zout` under
    the member's original name, using a fresh `ZipInfo` passed through
    `self._clean_zipinfo`.

    Returns True when the member was cleaned and written. When no parser
    is available for the member, `zout` is closed, the output archive
    (`self.output_filename`) is deleted, a message is printed and False
    is returned. `temp_folder` itself is never removed here.
    """
    zin.extract(member=item, path=temp_folder)
    extracted_path = os.path.join(temp_folder, item.filename)
    member_parser, mimetype = parser_factory.get_parser(extracted_path)  # type: ignore

    if member_parser:
        member_parser.remove_all()
        # Build the entry from the bare filename rather than reusing `item`,
        # so only the name is carried over from the source archive.
        fresh_info = self._clean_zipinfo(zipfile.ZipInfo(item.filename))  # type: ignore
        with open(member_parser.output_filename, 'rb') as cleaned:
            zout.writestr(fresh_info, cleaned.read())
        return True

    # Unsupported member: abandon the whole output archive.
    zout.close()
    os.remove(self.output_filename)
    print("%s's format (%s) isn't supported" % (item.filename, mimetype))
    return False
def remove_all(self) -> bool:
    """Write a cleaned copy of the archive `self.filename` to `self.output_filename`.

    Each member of the input archive is handled as follows:

    - names ending in '/' (directory entries) are dropped;
    - names listed in `self.files_to_keep` are copied with their content
      unchanged, using the `ZipInfo` returned by `self._clean_zipinfo`;
    - names matched by any pattern in `self.files_to_omit` (objects
      exposing `.search`, presumably compiled regexes) are dropped;
    - every other member is extracted to a temporary directory, cleaned
      by the parser that `parser_factory.get_parser` returns for it, and
      the parser's output is stored under the original name with a fresh
      `ZipInfo` passed through `self._clean_zipinfo`.

    Returns True on success. If a member has no available parser, the
    output archive is deleted, a message is printed and False is returned.
    """
    unsupported = None  # (member name, mimetype) of the first unsupported member

    # The temporary directory is a context manager so that it is removed
    # on every exit path, including exceptions raised by a sub-parser.
    with tempfile.TemporaryDirectory() as temp_folder, \
            zipfile.ZipFile(self.filename) as zin, \
            zipfile.ZipFile(self.output_filename, 'w') as zout:
        for item in zin.infolist():
            name = item.filename
            # `endswith` rather than `[-1]` so an empty name cannot raise;
            # `is_dir` is only added in Python 3.6.
            if name.endswith('/'):
                continue  # don't keep empty folders
            if name in self.files_to_keep:
                item = self._clean_zipinfo(item)
                zout.writestr(item, zin.read(item))
                continue
            if any(r.search(name) for r in self.files_to_omit):
                continue

            zin.extract(member=item, path=temp_folder)
            full_path = os.path.join(temp_folder, name)
            tmp_parser, mtype = parser_factory.get_parser(full_path)  # type: ignore
            if not tmp_parser:
                # Defer the cleanup until the archives are closed: removing
                # the output file while `zout` still holds it open fails on
                # Windows and leaves a dangling write on POSIX.
                unsupported = (name, mtype)
                break
            tmp_parser.remove_all()

            zinfo = zipfile.ZipInfo(name)  # type: ignore
            clean_zinfo = self._clean_zipinfo(zinfo)
            with open(tmp_parser.output_filename, 'rb') as f:
                zout.writestr(clean_zinfo, f.read())

    if unsupported is not None:
        os.remove(self.output_filename)
        print("%s's format (%s) isn't supported" % unsupported)
        return False
    return True
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment