From 5e78c0e036375a1db3da88a1546abe968db33dad Mon Sep 17 00:00:00 2001
From: jvoisin <julien.voisin@dustri.org>
Date: Tue, 9 Oct 2018 17:08:57 +0200
Subject: [PATCH] Implement lightweight mode for msoffice files

---
 libmat2/abstract.py   | 10 ++--------
 libmat2/office.py     |  5 +++--
 libmat2/pdf.py        |  7 ++++++-
 mat2                  |  3 +--
 tests/test_libmat2.py | 25 +++++++++++++++++++++++--
 5 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/libmat2/abstract.py b/libmat2/abstract.py
index cd72f2c..889c6c8 100644
--- a/libmat2/abstract.py
+++ b/libmat2/abstract.py
@@ -12,13 +12,14 @@ class AbstractParser(abc.ABC):
     meta_list = set()  # type: Set[str]
     mimetypes = set()  # type: Set[str]
 
-    def __init__(self, filename: str) -> None:
+    def __init__(self, filename: str, lightweight_cleaning: bool=False) -> None:
         """
         :raises ValueError: Raised upon an invalid file
         """
         self.filename = filename
         fname, extension = os.path.splitext(filename)
         self.output_filename = fname + '.cleaned' + extension
+        self.lightweight_cleaning = lightweight_cleaning
 
     @abc.abstractmethod
     def get_meta(self) -> Dict[str, str]:
@@ -27,10 +28,3 @@ class AbstractParser(abc.ABC):
     @abc.abstractmethod
     def remove_all(self) -> bool:
         pass  # pragma: no cover
-
-    def remove_all_lightweight(self) -> bool:
-        """ This method removes _SOME_ metadata.
-        It might be useful to implement it for fileformats that do
-        not support non-destructive cleaning.
-        """
-        return self.remove_all()
diff --git a/libmat2/office.py b/libmat2/office.py
index 32e7b75..afb5e33 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -267,8 +267,9 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
                 f.write(b'</cp:coreProperties>')
 
 
-        if self.__remove_rsid(full_path) is False:
-            return False
+        if self.lightweight_cleaning is False:
+            if self.__remove_rsid(full_path) is False:
+                return False
 
         try:
             _sort_xml_attributes(full_path)
diff --git a/libmat2/pdf.py b/libmat2/pdf.py
index c8769aa..b6b782e 100644
--- a/libmat2/pdf.py
+++ b/libmat2/pdf.py
@@ -37,7 +37,7 @@ class PDFParser(abstract.AbstractParser):
         except GLib.GError:  # Invalid PDF
             raise ValueError
 
-    def remove_all_lightweight(self):
+    def __remove_all_lightweight(self) -> bool:
         """
             Load the document into Poppler, render pages on a new PDFSurface.
         """
@@ -65,6 +65,11 @@ class PDFParser(abstract.AbstractParser):
         return True
 
     def remove_all(self):
+        if self.lightweight_cleaning is True:
+            return self.__remove_all_lightweight()
+        return self.__remove_all_complete()
+
+    def __remove_all_complete(self) -> bool:
         """
             Load the document into Poppler, render pages on PNG,
             and shove those PNG into a new PDF.
diff --git a/mat2 b/mat2
index 987e439..722b3f7 100755
--- a/mat2
+++ b/mat2
@@ -87,8 +87,7 @@ def clean_meta(filename: str, is_lightweight: bool, policy: UnknownMemberPolicy)
         print("[-] %s's format (%s) is not supported" % (filename, mtype))
         return False
     p.unknown_member_policy = policy
-    if is_lightweight:
-        return p.remove_all_lightweight()
+    p.lightweight_cleaning = is_lightweight
     return p.remove_all()
 
 
diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 957503d..e6f238e 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -182,6 +182,25 @@ class TestRevisionsCleaning(unittest.TestCase):
         os.remove('./tests/data/revision_clean.cleaned.docx')
 
 class TestLightWeightCleaning(unittest.TestCase):
+    def test_msoffice(self):
+        shutil.copy('./tests/data/dirty.docx', './tests/data/clean.docx')
+        p = office.MSOfficeParser('./tests/data/clean.docx')
+
+        meta = p.get_meta()
+        self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin')
+
+        p.lightweight_cleaning = True
+        ret = p.remove_all()
+        self.assertTrue(ret)
+
+        # FIXME blocked by https://0xacab.org/jvoisin/mat2/issues/73
+        #p = office.MSOfficeParser('./tests/data/clean.cleaned.docx')
+        #expected_meta = {'creation-date': -1, 'format': 'docx-1.5', 'mod-date': -1}
+        #self.assertEqual(p.get_meta(), expected_meta)
+
+        os.remove('./tests/data/clean.docx')
+        os.remove('./tests/data/clean.cleaned.docx')
+
     def test_pdf(self):
         shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
         p = pdf.PDFParser('./tests/data/clean.pdf')
@@ -189,7 +208,8 @@ class TestLightWeightCleaning(unittest.TestCase):
         meta = p.get_meta()
         self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
 
-        ret = p.remove_all_lightweight()
+        p.lightweight_cleaning = True
+        ret = p.remove_all()
         self.assertTrue(ret)
 
         p = pdf.PDFParser('./tests/data/clean.cleaned.pdf')
@@ -206,7 +226,8 @@ class TestLightWeightCleaning(unittest.TestCase):
         meta = p.get_meta()
         self.assertEqual(meta['Comment'], 'This is a comment, be careful!')
 
-        ret = p.remove_all_lightweight()
+        p.lightweight_cleaning = True
+        ret = p.remove_all()
         self.assertTrue(ret)
 
         p = images.PNGParser('./tests/data/clean.cleaned.png')
-- 
GitLab