From bee56a57ce0f45c51386423d508ba06836be366e Mon Sep 17 00:00:00 2001
From: jvoisin <julien.voisin@dustri.org>
Date: Sun, 1 Jul 2018 23:11:10 +0200
Subject: [PATCH] Remove docx revisions

---
 libmat2/office.py        |  81 +++++++++++++++++++++++++++++++--------
 tests/data/revision.docx | Bin 0 -> 4701 bytes
 tests/test_libmat2.py    |  21 ++++++++++
 3 files changed, 86 insertions(+), 16 deletions(-)
 create mode 100644 tests/data/revision.docx

diff --git a/libmat2/office.py b/libmat2/office.py
index 5381eb9..acd8ca2 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -14,6 +14,24 @@ from . import abstract, parser_factory
 assert Set
 assert Pattern
 
+def _parse_xml(full_path: str):
+    """ Parse the XML file at `full_path` with namespace support.
+
+    Return a `(tree, namespaces)` tuple: the parsed ElementTree, and a
+    dict mapping every namespace prefix declared in the file to its URI.
+    """
+    # etree support for ns is a bit rough, so the file is read twice:
+    # once to collect the prefixes (only "start-ns" events are emitted,
+    # a later declaration of a prefix wins), and once to build the tree.
+    events = ET.iterparse(full_path, ("start-ns", ))
+    namespaces = {prefix: uri for _, (prefix, uri) in events}
+
+    # Register the namespaces, so that they're serialised with their prefix
+    for prefix, uri in namespaces.items():
+        ET.register_namespace(prefix, uri)
+    return ET.parse(full_path), namespaces
+
+
 class ArchiveBasedAbstractParser(abstract.AbstractParser):
     # Those are the files that have a format that _isn't_
     # supported by MAT2, but that we want to keep anyway.
@@ -72,7 +90,11 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
                 zin.extract(member=item, path=temp_folder)
                 full_path = os.path.join(temp_folder, item.filename)
 
-                self._specific_cleanup(full_path)
+                if self._specific_cleanup(full_path) is False:
+                    shutil.rmtree(temp_folder)
+                    os.remove(self.output_filename)
+                    print("Something went wrong during deep cleaning of %s" % item.filename)
+                    return False
 
                 if item.filename in self.files_to_keep:
                     # those files aren't supported, but we want to add them anyway
@@ -118,6 +140,45 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
             '^docProps/',
     }))
 
+    def __remove_revisions(self, full_path:str) -> bool:
+        """ Remove tracked changes from the XML document at `full_path`.
+
+        Deletions (`w:del`) are dropped along with their content, while
+        insertions (`w:ins`) are replaced by their own children, so that
+        the inserted content is kept but the revision mark itself is not.
+        The elements to change are collected into lists first, since we
+        don't want to change the tree we're iterating on.
+
+        Always return True; the file is rewritten only if needed.
+        """
+        tree, ns = _parse_xml(full_path)
+        if 'w' not in ns:  # searching for an undeclared prefix would raise
+            return True
+
+        deletions = list(tree.iterfind('.//w:del', ns))
+        insertions = list(tree.iterfind('.//w:ins', ns))
+        if not deletions and not insertions:
+            return True  # no revisions are present
+
+        parent_map = {c:p for p in tree.iter() for c in p}
+        for element in deletions:
+            parent_map[element].remove(element)
+
+        for element in insertions:  # in document order: outermost first
+            parent = parent_map[element]
+            position = list(parent).index(element)
+            # Put the children exactly where their `w:ins` wrapper was.
+            parent[position:position + 1] = list(element)
+            parent_map.update((child, parent) for child in element)
+
+        tree.write(full_path, xml_declaration=True)
+        return True
+
+    def _specific_cleanup(self, full_path:str) -> bool:
+        if not full_path.endswith('/word/document.xml'):
+            return True  # only the main document part is cleaned here
+        return self.__remove_revisions(full_path)
+
     def get_meta(self) -> Dict[str, str]:
         """
         Yes, I know that parsing xml with regexp ain't pretty,
@@ -168,27 +229,16 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
 
 
     def __remove_revisions(self, full_path:str) -> bool:
-        def parse_map(f):  # etree support for ns is a bit rough
-            ns_map = dict()
-            for event, (k, v) in ET.iterparse(f, ("start-ns", )):
-                if event == "start-ns":
-                    ns_map[k] = v
-            return ns_map
-
-        ns = parse_map(full_path)
+        tree, ns = _parse_xml(full_path)
+        # ns maps each prefix declared in the file to its namespace URI
         if 'office' not in ns.keys():  # no revisions in the current file
             return True
 
-        # Register the namespaces
-        for k,v in ns.items():
-            ET.register_namespace(k, v)
-
-        tree = ET.parse(full_path)
         for text in tree.getroot().iterfind('.//office:text', ns):
             for changes in text.iterfind('.//text:tracked-changes', ns):
                 text.remove(changes)
 
-        tree.write(full_path, xml_declaration = True)
+        tree.write(full_path, xml_declaration=True)
 
         return True
 
@@ -219,4 +269,3 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
                 metadata[key] = value
         zipin.close()
         return metadata
-
diff --git a/tests/data/revision.docx b/tests/data/revision.docx
new file mode 100644
index 0000000000000000000000000000000000000000..8a2d814cbf0422bee6878ca960587c0413de5f8d
GIT binary patch
literal 4701
zcmWIWW@Zs#;Nak3NUnY8!+-?185kJii&Arn_4PpH+DX3N%#J*5@BfNAzq^~G-ErdF
zHzlWhrLXKQGvB<saE{$LbM60oj$FF}x)vPYQ~CLW%%$9O{}%Yo>F#zsyeiYN(BR@O
zgRF|%w)*l3m-Tmr+fBIQH8VzQ)xDNQi9(+J9Y=lMtTZt!TrpcQ%|Gb4O_I{)<r53<
zZu>r?-e|#m!>iY_PDJl!+WOn$kGWZu*d~=0wY0cpG3z&5=!yDOrd&)|T)DFFXO_uS
zo{HFvztsyZc+PzYHgP}ixUTYJq(#KG?Nc?DM!S{&&os%eIlbq2&c(P#-DejazA61>
zet<VS$BPS$$~PGp7;Kps7;uM(00RR<N`7)cQGP+Oequp^UPW%s+=-|24l4+_e*eqG
zoht48=9HJR&z1IFT~|CxC%;qT5}tj^hDF8e-@Y9aW7hAUeOmUMEZeSqFYK(F`A^M~
z(k*H-GC1UwdP}O-Wc_E=Q%+CZKHZaAaZ!{hYP0pOuh$y;xtykPs9jmneQ~V;-y-I-
z&hv66&sqF+)^U~E?zbOTu<VH_yp*7%$Fb5NMoEIR=$)IwF2V5UKjK$xNt&VmX4YzN
z8#n0->z)fr9Ia1HR(>zNv{CocM~xSYi}woIEbHCS`|Hz(Z_<sc=R8S`J7acgj$Zru
z)Y292HhUgcw0$V;a-=20sM=~}+&S*9<rcf1>b5Ap54*7UtI>u<OMZl1eBM{Bm-I{E
z+XOxTCr56rzOmO@kL}8$Ic2_craY~Dz4JAbcKd-qRYSeme;GhQ9Ja2nOO=s<fr$xU
z5DOv)adLi9DkzL=gM9rD8}RJ?ti9^aZI#yv%CD?AFZlU5MX3vHpHLO;Hrvf}SAV_#
zt+lBKj-35gUHPqejt_sT;F|*xkt;bD2|6-r1U?N~tDZ6Q>>)$0HIKEPyDm*=>Pxt2
zve&aEZIRK3BrV=!o*Np)gQjuXI!Zq`Ieepwcgh{Nc?)fJtnxWI*Z-_kww~QjsSgbk
ztYexUTb9e1H!XZ8dNI;SeD9KVz8ATkFgY`-YNQ+XYW#a<Ww3PbuLZ7pGnSUP^M<dK
zy|I};?AwvQb@Fc?PkZ%z`H$RxdotFx1(~iEYPj^ohC`TbN3+cbrZ-pCImbOdzC5Sx
z65~P>mB$nAq)plI(6;+!byNM`d=~%0)JwUs9svi}O$uApc1xL8n_au-%DMWCTTS*{
zcKSP_uGjt6Tv4zyL$h|}&+ARm=HJCPOqyUXv-jB4DD$|=b${QVpDo?ml2&r0%}{wU
zcl}(4+cyvRnYGV2mV4PQciFbD$HiK#BVP&2TfgM8n7aGJ`NC!Kpg3C{V*9X^k%8d_
z-Z+zCU|=ZEFG|selol!Z$)&lec_pBjgOnT-eYp-92(*5mr*!7tvQUAE%ko?bw7)VK
z&fRv<jPs;MRW);0l*Xe&Pyf`uH`I@h|J@gRfjzgkLC3RkVc@n)%LEHFwqI6V)9tzM
zxL;5V=i0Kyqte1GTU_#vRtD&PGg7&>uygXQUvWV?wTI`1sNYnWzw~9(Z+|usX0_X)
zOn=s$YM5%?zJEh*Y<8>k>T|YIQWGCE=E=5sOxfos^kK#16O49x_P>|c<nm7abmQUe
zgR8%uJ@RM0y~CmzPoIm=o4dT#^Bns_v2svw{M0u%aFl_8K^SjvfRZpcIEqtCN;31(
zi$MY5I>FcPumex)_c=<lZ09xz*auGP@;%xlI)!;h=j5IllKsc>oGfkg7I)p<Jm3EJ
z$*<qke<U3{)0dR|&TOfsLD`kv7G}?~_wTMXW92>~XXBc4hVy35?b%f;C0t9oxbF(=
zjFK|$TyiSvof^*pX)lSZ(i-dnx}P(|Pt~5fqPTg{!P~z#)ofvmF<HCrk1E&EAk{On
zZWk^1mu^gVHY&L<Z?U`BXWy18nFlXY`P=;$F1f<9&(QzrX|Dr;FWyd3>rprr^Ow=)
zvDL*u4~5*^|4ZdAz1Xo><GsLf?UWFO4SO~<Z?)QSZ{;TOU%xuO@q@xCz2r>uO9lo8
zUq*acT?m$J((?04LK2g5z<K>_&}qNJ1_FCNi%xrgC6k{+=@M6?MX2S4PmI##m)AUT
ze^M&_>-!GRszw<BEg|#jbQ@#qPmf<6mMT1Q;7P3F^UXe7CAZ$6c+35BU+vz#)0>oc
z$V$B5FhM<KXVg8LvdXuyDHG>epWE$ZX1Hx<&>HD22aNA8@{V}h+&W`j;(9CjvfK|+
zGbisd&6=26eu~M`@>I+G6F+vT_3FpT>+G`VU@;N6%yuW>bdlvU;n;TvmR*S6bYjsF
zzde=Q>(pPcF+HAH%o2P;XY#6g_GxEc8u8lhWsmCGwew8B0Ebp{x(9FCeIIl6=t|?4
zTX$@`9=ymW+IQViS;POAvV29`_T_9m=urBeOKlqG-WN4*qkUCp$M0@QHwa;!wdUcg
zG}W804jisq^o2j^Z+(}<=G+4xOV&Kx{U>n7#{;MRL1BJNwyf5Kk%1wG1z(uM(<X8z
zJsa+wf7?P}-}i6<t1oHZMRSg%CX09TZhMi~%}~g<+dN9j)8z+~XXMv^zxcjx_>`4n
za#E?mu}kLD?Dx`HZeNa0mk*kt8*zMVl-o;_PL35m&ts;Zev$U~dWxXn^O>h+Uf`N`
zkuT;;>eq*h7VbQk#d%8DC7|zBYDlzeY*=GTo>FDn*2}H)Q=)=mMQob)b^nqrlMKBl
zEp`0wvXy2FE`()h&pf_R_i^6o)c4x+R8L0cI2!cV#2)RHzOe25&--46YqI-3-pZIQ
zz1p~dk6F?6dAQNKZw8q~Ee{$t-%srOI4Anrox;lbCsNa81n(-Ju<g~n^kVV#MGA4V
zY!2*RH=|K-ZH?ru@YUJI3r@a~*gEOv?OQD!k~SB4#qYe2dZ@gz<eyO(fAY8LiidwE
zO8yPg%Dw59prjzc`DT&mvqNsrcVC#jZRe)k^0(zo9?si+bQZsN+}iOdVPm4B&9}d7
z>txnF`)PDYB}Zd*?NUScyen%j^(xhGd7N-^raRM#Ttm-uYMYN%l{r0s{-xw&zV|V^
zPZuwkd9O6qIXw4R6Z7uYjIHZ-DE~-SS@ZR7-!@mH@0LsVAIx-|{6x}EyR&rJg9*a?
z_J5}QS)BbyE4F)1hHBXnMge(=q^F7zO&ymkLR1+aCj3%)Tw&2XUB-oPk+^O3*Jg(|
zrd_eMzfbg(%~x%+>rnrl<<%*8Kt^EUc88SZ#Wze>9i15-^n1qBdnZpHJpTRplOF-|
z{3n&nop<?X&$-L-t)F<$)v8$6=cjBua-{s0eeCVeyK}9-&slLfT<9ye*!Ms3>nyqw
zjGAp@D$d>O`X=zEDbcc%P0;_$wZn4`If>j2-^q2eI^ginYL34T1N8r~&*Ck;_bIwU
z;#ij~-{a-wg2&j})?d+WsQhRBMB%gTt#fZKM?bVKK6>ikSEHw2>oX2^XIjlZdpp0r
z_55<~ns?%V&gz`w(y)=bG<lm%WL#Kzo9G6aoci()w|b^IW$)jfv*mkK-?2^K_HHcy
zx_8>^zAQySyY`;<-hn%I#YVoJe`4YUw&`~k#q62EUzT`%_PL(<+t*KfzNFi^K{rCT
zCP86GllHsK3oJ7&(?mC4kG(P3?rxjKvBR<2a{n9F)G>jw+?5?6FHM*k7>@Gb%X0j%
zVy?KPGA9*W8C(s!U3AMpp!R)u#TP#w$z{Htb9ht_w@h0&H|E#AZPz0@b#_jYaoV?U
z(uT7~QyOB@qU1J|l-#y`Uh#Bu_wt^~ehpDk8-=-x%|1?WS6WshHDUMd#MdnqUd;WC
zoPCTl7b}TBpMNgRcIk7Tsp;2>ug+#wxoY+P=f19>GY+zyGCL)`Th2JlUN`5LvEt*H
zE{>^2x6{wq=WVzDEg9gqD%a}nF*nKHB>_qo-6x)?nW$pow0&yiz0BQV$5!!f%9!t9
zG=bmsp3(283o<<<uNI5*eB89cd{&X7Vv1U{Fu$ew&SOfuj<9eooBH-t{H8b4Y`;rz
zt($88y0J@x_XL9kuc|7W=kJ)c${b$bMOzzuTEy=07ChzM_y0p4+dWf1ZnjJ3x<VX|
zDJ{LV`-uAk1J=#Awzu0Zc#?KVc|(JYzQWfSW>cd8(`&H_JU_M^ny~#{-nGS*e$maH
ztxi(2JA!XX`Mz=5?{oRSkKO$@J1w=(S6#`!@Iof#+@;TV3g$MqtZcE_dVKf7z?_$c
zn+(;N+Id!A@V>G1OZtl?Utcq%?Un33nv?KlLQ|->Py&~@wn@vXny|$@!nyqCS|9%L
z{=GWaxJ4q>VMk$|r_<IT(MrAE{@<-);h$|><BZp)2FfK{uDS4yX;=R>zAw{r_P$um
zdG?S~<#i?IcfLo$^J}WSjXyv0E!gG|uCwj_lco2jSMcxWe!l-yfph1JUwRKb3>06h
zyjj1$pLw_ZONaEnTHcaW7n7^Y9xmXD*I?y(Jili4ZVS7YT>DMm<}1zH|6(`$k(y1L
z5?-=Sb#{q;q&O?6Wm^9>UJ;>O+iRA2@{icU4UR3Z_4J*?`_tT=|3bCv;mNvE^Y*xQ
z&D!UytMlUK#kpBip6J|6new<aG23)T_{1q4y<b|^OxWvIS}uKcy>FJ!)I60Z2Xc3=
zvCi0^^;Le##MtRKc3+vozxOKNr5c8vX7{~$Z)(>}@7^BL_sU69`nzA3j!Arq{uSP6
z&A3?ed%^#fZ7BI@A5b07zCJYJNODcqhy4!sd;iuqUfF)-)&-GSe$6V@k6KH4ttPB!
zlP#aUai-JNpy&0Bpj<Wox7T$^W(I}^ytzu4fq@~~IX|x?HLoN-q_QBjI2P2*+B?PH
zzr{e{==<NA9_!h<<a!Jb$obg3lKYU8W$mE6#drC--S_WlZaN|%T=utIQL*;@-+Mls
zN2|=goD{Rk%JwRo_f&^bdF`cTx;LF`XUA<U-KzC!FSCzF_J)*Lj(4+)r^xa!8NWVo
z!87@<0@L1K)2tQeyK?_gI<LQTi&_I`W{2b&t#_Qyw>G6H*X$5xp3-Aq^<rZB$CHz0
zmpi9S^Rc*|t;zKDar9!li<de@<}PNle*2+^rKjFEC@gPJ=?P7X?p>1$wq<T%IaMKf
zV$#9BIb0#HG=yp{6v#wJ@h?BjzuE2k#3SLauWq@s)8yp;<crEuiY&jTd^kS4FVkcb
zi_59SoO~y{MWvdqJzDwy*rDlGXD)WD+Ufl-w?3oZd?D?p-JyN$`*-vjmLFQrW%o~i
zl{qN13(79O)MaE~hyynp7@0&E5Q728onmgJfdL2;(%1(#@B_S2HHIN|^+0`Qhz1ZT
z#E990M%Rtp=>WA?K>8rKlnF^UtaFI29l4PX>X|^af=Epk{QVSklaQOSs1C7Ug_?xi
z#7EbT+#&<DbwN&m-~=|Pc9fPcx_;#91=ad1>`?v4tvGb;$oU9V2SKa`k?!13?T8u)
lT{CjB0A)3hHVB@}hc)8`c(byB6mT+dGT1ONFmQw89ROn1)$srT

literal 0
HcmV?d00001

diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py
index 1573790..4df6385 100644
--- a/tests/test_libmat2.py
+++ b/tests/test_libmat2.py
@@ -121,6 +121,7 @@ class TestRemovingThumbnails(unittest.TestCase):
         zipin.close()
 
         os.remove('./tests/data/clean.cleaned.odt')
+        os.remove('./tests/data/clean.odt')
 
 
 class TestRevisionsCleaning(unittest.TestCase):
@@ -142,6 +143,26 @@ class TestRevisionsCleaning(unittest.TestCase):
         os.remove('./tests/data/clean.odt')
         os.remove('./tests/data/clean.cleaned.odt')
 
+    def test_msoffice(self):
+        """ Tracked insertions must be gone from a cleaned docx. """
+        source = './tests/data/revision.docx'
+        target = './tests/data/revision_clean.docx'
+        cleaned = './tests/data/revision_clean.cleaned.docx'
+        r = b'<w:ins w:id="1" w:author="Unknown Author" w:date="2018-06-28T23:48:00Z">'
+
+        with zipfile.ZipFile(source) as zipin:
+            self.assertIn(r, zipin.read('word/document.xml'))
+
+        shutil.copy(source, target)
+        p = office.MSOfficeParser(target)
+        self.assertTrue(p.remove_all())
+
+        with zipfile.ZipFile(cleaned) as zipin:
+            self.assertNotIn(r, zipin.read('word/document.xml'))
+
+        os.remove(target)
+        os.remove(cleaned)
+
 
 class TestDeepCleaning(unittest.TestCase):
     def __check_deep_meta(self, p):
-- 
GitLab