Commit 5e78c0e0 authored by jvoisin

Implement lightweight mode of msoffice files

parent 4ed30b5e
Pipeline #19438 failed with stages
in 4 minutes and 16 seconds
......@@ -12,13 +12,14 @@ class AbstractParser(abc.ABC):
meta_list = set() # type: Set[str]
mimetypes = set() # type: Set[str]
def __init__(self, filename: str, lightweight_cleaning: bool = False) -> None:
    """
    Remember the input file and derive the path of its cleaned copy.

    :param filename: path of the file to process
    :param lightweight_cleaning: stored unchanged on the instance; it
        defaults to False so one-argument callers keep working
    :raises ValueError: Raised upon an invalid file (nothing in this
        method raises it; parsers extending it do)
    """
    self.filename = filename
    # `dir/name.ext` becomes `dir/name.cleaned.ext`; a name without an
    # extension simply gets `.cleaned` appended.
    fname, extension = os.path.splitext(filename)
    self.output_filename = fname + '.cleaned' + extension
    self.lightweight_cleaning = lightweight_cleaning
@abc.abstractmethod
def get_meta(self) -> Dict[str, str]:
......@@ -27,10 +28,3 @@ class AbstractParser(abc.ABC):
@abc.abstractmethod
def remove_all(self) -> bool:
    """
    Clean the file; every concrete parser has to override this.

    :return: a success flag (callers hand it back unchanged and the
        tests assert that it is truthy)
    """
    pass  # pragma: no cover
def remove_all_lightweight(self) -> bool:
    """
    Remove _SOME_ metadata.

    Meant to be overridden for file formats that do not support
    non-destructive cleaning. Unless overridden, it performs exactly
    the cleaning done by `remove_all`.

    :return: whatever `remove_all` returns
    """
    return self.remove_all()
......@@ -267,8 +267,9 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
f.write(b'</cp:coreProperties>')
if self.__remove_rsid(full_path) is False:
return False
if self.lightweight_cleaning is False:
if self.__remove_rsid(full_path) is False:
return False
try:
_sort_xml_attributes(full_path)
......
......@@ -37,7 +37,7 @@ class PDFParser(abstract.AbstractParser):
except GLib.GError: # Invalid PDF
raise ValueError
def remove_all_lightweight(self):
def __remove_all_ligthweight(self) -> bool:
"""
Load the document into Poppler, render pages on a new PDFSurface.
"""
......@@ -65,6 +65,11 @@ class PDFParser(abstract.AbstractParser):
return True
def remove_all(self) -> bool:
    """
    Clean the document with the strategy selected on the instance.

    :return: the result of the lightweight helper when
        `self.lightweight_cleaning` is True, otherwise the result of
        the complete one
    """
    # Identity check on purpose: only the literal `True` selects the
    # lightweight path, any other value falls through to the complete one.
    if self.lightweight_cleaning is True:
        # The spelling "ligthweight" matches the name the helper is
        # defined under; renaming it here alone would break the call.
        return self.__remove_all_ligthweight()
    return self.__remove_all_complete()
def __remove_all_complete(self) -> bool:
"""
Load the document into Poppler, render pages on PNG,
and shove those PNG into a new PDF.
......
......@@ -87,8 +87,7 @@ def clean_meta(filename: str, is_lightweight: bool, policy: UnknownMemberPolicy)
print("[-] %s's format (%s) is not supported" % (filename, mtype))
return False
p.unknown_member_policy = policy
if is_lightweight:
return p.remove_all_lightweight()
# Parsers read `lightweight_cleaning`; assigning to any other attribute
# name just creates an unused attribute and leaves the mode disabled.
p.lightweight_cleaning = is_lightweight
return p.remove_all()
......
......@@ -182,6 +182,25 @@ class TestRevisionsCleaning(unittest.TestCase):
os.remove('./tests/data/revision_clean.cleaned.docx')
class TestLightWeightCleaning(unittest.TestCase):
def test_msoffice(self):
    """
    Lightweight cleaning of a .docx must report success.

    Works on a throwaway copy of `dirty.docx`, first checking that the
    copy really carries author metadata, then removes both the copy and
    the cleaned output.
    """
    shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx')
    p = office.MSOfficeParser('./tests/data/clean.docx')

    meta = p.get_meta()
    self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin')

    # Switch the already-built parser to lightweight mode before cleaning.
    p.lightweight_cleaning = True
    ret = p.remove_all()
    self.assertTrue(ret)

    # The metadata of the cleaned file is not verified yet:
    # FIXME blocked by https://0xacab.org/jvoisin/mat2/issues/73
    #p = office.MSOfficeParser('./tests/data/clean.cleaned.docx')
    #expected_meta = {'creation-date': -1, 'format': 'docx-1.5', 'mod-date': -1}
    #self.assertEqual(p.get_meta(), expected_meta)

    os.remove('./tests/data/clean.docx')
    os.remove('./tests/data/clean.cleaned.docx')
def test_pdf(self):
shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
p = pdf.PDFParser('./tests/data/clean.pdf')
......@@ -189,7 +208,8 @@ class TestLightWeightCleaning(unittest.TestCase):
meta = p.get_meta()
self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
ret = p.remove_all_lightweight()
p.lightweight_cleaning = True
ret = p.remove_all()
self.assertTrue(ret)
p = pdf.PDFParser('./tests/data/clean.cleaned.pdf')
......@@ -206,7 +226,8 @@ class TestLightWeightCleaning(unittest.TestCase):
meta = p.get_meta()
self.assertEqual(meta['Comment'], 'This is a comment, be careful!')
ret = p.remove_all_lightweight()
p.lightweight_cleaning = True
ret = p.remove_all()
self.assertTrue(ret)
p = images.PNGParser('./tests/data/clean.cleaned.png')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment