From bee56a57ce0f45c51386423d508ba06836be366e Mon Sep 17 00:00:00 2001 From: jvoisin <julien.voisin@dustri.org> Date: Sun, 1 Jul 2018 23:11:10 +0200 Subject: [PATCH] Remove docx revisions --- libmat2/office.py | 81 +++++++++++++++++++++++++++++++-------- tests/data/revision.docx | Bin 0 -> 4701 bytes tests/test_libmat2.py | 21 ++++++++++ 3 files changed, 86 insertions(+), 16 deletions(-) create mode 100644 tests/data/revision.docx diff --git a/libmat2/office.py b/libmat2/office.py index 5381eb9..acd8ca2 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -14,6 +14,24 @@ from . import abstract, parser_factory assert Set assert Pattern +def _parse_xml(full_path: str): + """ This function parse XML with namespace support. """ + def parse_map(f): # etree support for ns is a bit rough + ns_map = dict() + for event, (k, v) in ET.iterparse(f, ("start-ns", )): + if event == "start-ns": + ns_map[k] = v + return ns_map + + ns = parse_map(full_path) + + # Register the namespaces + for k,v in ns.items(): + ET.register_namespace(k, v) + + return ET.parse(full_path), ns + + class ArchiveBasedAbstractParser(abstract.AbstractParser): # Those are the files that have a format that _isn't_ # supported by MAT2, but that we want to keep anyway. @@ -72,7 +90,11 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): zin.extract(member=item, path=temp_folder) full_path = os.path.join(temp_folder, item.filename) - self._specific_cleanup(full_path) + if self._specific_cleanup(full_path) is False: + shutil.rmtree(temp_folder) + os.remove(self.output_filename) + print("Something went wrong during deep cleaning of %s" % item.filename) + return False if item.filename in self.files_to_keep: # those files aren't supported, but we want to add them anyway @@ -118,6 +140,45 @@ class MSOfficeParser(ArchiveBasedAbstractParser): '^docProps/', })) + def __remove_revisions(self, full_path:str) -> bool: + """ In this function, we're changing the XML + document in two times, since we don't want + to change the tree we're iterating on.""" + tree, ns = _parse_xml(full_path) + + # No revisions are present + if tree.find('.//w:del', ns) is None: + return True + elif tree.find('.//w:ins', ns) is None: + return True + + parent_map = {c:p for p in tree.iter( ) for c in p} + + elements = list([element for element in tree.iterfind('.//w:del', ns)]) + for element in elements: + parent_map[element].remove(element) + + elements = list() + for element in tree.iterfind('.//w:ins', ns): + for position, item in enumerate(tree.iter()): + if item == element: + for children in element.iterfind('./*'): + elements.append((element, position, children)) + break + + for (element, position, children) in elements: + parent_map[element].insert(position, children) + parent_map[element].remove(element) + + tree.write(full_path, xml_declaration=True) + + return True + + def _specific_cleanup(self, full_path:str) -> bool: + if full_path.endswith('/word/document.xml'): + return self.__remove_revisions(full_path) + return True + def get_meta(self) -> Dict[str, str]: """ Yes, I know that parsing xml with regexp ain't pretty, @@ -168,27 +229,16 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): def __remove_revisions(self, full_path:str) -> bool: - def parse_map(f): # etree support for ns is a bit rough - ns_map = dict() - for event, (k, v) in ET.iterparse(f, ("start-ns", )): - if event == "start-ns": - ns_map[k] = v - return ns_map - - ns = parse_map(full_path) + tree, ns = _parse_xml(full_path) + if 'office' not in ns.keys(): # no revisions in the current file return True - # Register the namespaces - for k,v in ns.items(): - ET.register_namespace(k, v) - - tree = ET.parse(full_path) for text in tree.getroot().iterfind('.//office:text', ns): for changes in text.iterfind('.//text:tracked-changes', ns): text.remove(changes) - tree.write(full_path, xml_declaration = True) + tree.write(full_path, xml_declaration=True) return True @@ -219,4 +269,3 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): metadata[key] = value zipin.close() return metadata - diff --git a/tests/data/revision.docx b/tests/data/revision.docx new file mode 100644 index 0000000000000000000000000000000000000000..8a2d814cbf0422bee6878ca960587c0413de5f8d GIT binary patch literal 4701 zcmWIWW@Zs#;Nak3NUnY8!+-?185kJii&Arn_4PpH+DX3N%#J*5@BfNAzq^~G-ErdF zHzlWhrLXKQGvB<saE{$LbM60oj$FF}x)vPYQ~CLW%%$9O{}%Yo>F#zsyeiYN(BR@O zgRF|%w)*l3m-Tmr+fBIQH8VzQ)xDNQi9(+J9Y=lMtTZt!TrpcQ%|Gb4O_I{)<r53< zZu>r?-e|#m!>iY_PDJl!+WOn$kGWZu*d~=0wY0cpG3z&5=!yDOrd&)|T)DFFXO_uS zo{HFvztsyZc+PzYHgP}ixUTYJq(#KG?Nc?DM!S{&&os%eIlbq2&c(P#-DejazA61> zet<VS$BPS$$~PGp7;Kps7;uM(00RR<N`7)cQGP+Oequp^UPW%s+=-|24l4+_e*eqG zoht48=9HJR&z1IFT~|CxC%;qT5}tj^hDF8e-@Y9aW7hAUeOmUMEZeSqFYK(F`A^M~ z(k*H-GC1UwdP}O-Wc_E=Q%+CZKHZaAaZ!{hYP0pOuh$y;xtykPs9jmneQ~V;-y-I- z&hv66&sqF+)^U~E?zbOTu<VH_yp*7%$Fb5NMoEIR=$)IwF2V5UKjK$xNt&VmX4YzN z8#n0->z)fr9Ia1HR(>zNv{CocM~xSYi}woIEbHCS`|Hz(Z_<sc=R8S`J7acgj$Zru z)Y292HhUgcw0$V;a-=20sM=~}+&S*9<rcf1>b5Ap54*7UtI>u<OMZl1eBM{Bm-I{E z+XOxTCr56rzOmO@kL}8$Ic2_craY~Dz4JAbcKd-qRYSeme;GhQ9Ja2nOO=s<fr$xU z5DOv)adLi9DkzL=gM9rD8}RJ?ti9^aZI#yv%CD?AFZlU5MX3vHpHLO;Hrvf}SAV_# zt+lBKj-35gUHPqejt_sT;F|*xkt;bD2|6-r1U?N~tDZ6Q>>)$0HIKEPyDm*=>Pxt2 zve&aEZIRK3BrV=!o*Np)gQjuXI!Zq`Ieepwcgh{Nc?)fJtnxWI*Z-_kww~QjsSgbk ztYexUTb9e1H!XZ8dNI;SeD9KVz8ATkFgY`-YNQ+XYW#a<Ww3PbuLZ7pGnSUP^M<dK zy|I};?AwvQb@Fc?PkZ%z`H$RxdotFx1(~iEYPj^ohC`TbN3+cbrZ-pCImbOdzC5Sx z65~P>mB$nAq)plI(6;+!byNM`d=~%0)JwUs9svi}O$uApc1xL8n_au-%DMWCTTS*{ zcKSP_uGjt6Tv4zyL$h|}&+ARm=HJCPOqyUXv-jB4DD$|=b${QVpDo?ml2&r0%}{wU zcl}(4+cyvRnYGV2mV4PQciFbD$HiK#BVP&2TfgM8n7aGJ`NC!Kpg3C{V*9X^k%8d_ z-Z+zCU|=ZEFG|selol!Z$)&lec_pBjgOnT-eYp-92(*5mr*!7tvQUAE%ko?bw7)VK z&fRv<jPs;MRW);0l*Xe&Pyf`uH`I@h|J@gRfjzgkLC3RkVc@n)%LEHFwqI6V)9tzM zxL;5V=i0Kyqte1GTU_#vRtD&PGg7&>uygXQUvWV?wTI`1sNYnWzw~9(Z+|usX0_X) zOn=s$YM5%?zJEh*Y<8>k>T|YIQWGCE=E=5sOxfos^kK#16O49x_P>|c<nm7abmQUe zgR8%uJ@RM0y~CmzPoIm=o4dT#^Bns_v2svw{M0u%aFl_8K^SjvfRZpcIEqtCN;31( zi$MY5I>FcPumex)_c=<lZ09xz*auGP@;%xlI)!;h=j5IllKsc>oGfkg7I)p<Jm3EJ z$*<qke<U3{)0dR|&TOfsLD`kv7G}?~_wTMXW92>~XXBc4hVy35?b%f;C0t9oxbF(= zjFK|$TyiSvof^*pX)lSZ(i-dnx}P(|Pt~5fqPTg{!P~z#)ofvmF<HCrk1E&EAk{On zZWk^1mu^gVHY&L<Z?U`BXWy18nFlXY`P=;$F1f<9&(QzrX|Dr;FWyd3>rprr^Ow=) zvDL*u4~5*^|4ZdAz1Xo><GsLf?UWFO4SO~<Z?)QSZ{;TOU%xuO@q@xCz2r>uO9lo8 zUq*acT?m$J((?04LK2g5z<K>_&}qNJ1_FCNi%xrgC6k{+=@M6?MX2S4PmI##m)AUT ze^M&_>-!GRszw<BEg|#jbQ@#qPmf<6mMT1Q;7P3F^UXe7CAZ$6c+35BU+vz#)0>oc z$V$B5FhM<KXVg8LvdXuyDHG>epWE$ZX1Hx<&>HD22aNA8@{V}h+&W`j;(9CjvfK|+ zGbisd&6=26eu~M`@>I+G6F+vT_3FpT>+G`VU@;N6%yuW>bdlvU;n;TvmR*S6bYjsF zzde=Q>(pPcF+HAH%o2P;XY#6g_GxEc8u8lhWsmCGwew8B0Ebp{x(9FCeIIl6=t|?4 zTX$@`9=ymW+IQViS;POAvV29`_T_9m=urBeOKlqG-WN4*qkUCp$M0@QHwa;!wdUcg zG}W804jisq^o2j^Z+(}<=G+4xOV&Kx{U>n7#{;MRL1BJNwyf5Kk%1wG1z(uM(<X8z zJsa+wf7?P}-}i6<t1oHZMRSg%CX09TZhMi~%}~g<+dN9j)8z+~XXMv^zxcjx_>`4n za#E?mu}kLD?Dx`HZeNa0mk*kt8*zMVl-o;_PL35m&ts;Zev$U~dWxXn^O>h+Uf`N` zkuT;;>eq*h7VbQk#d%8DC7|zBYDlzeY*=GTo>FDn*2}H)Q=)=mMQob)b^nqrlMKBl zEp`0wvXy2FE`()h&pf_R_i^6o)c4x+R8L0cI2!cV#2)RHzOe25&--46YqI-3-pZIQ zz1p~dk6F?6dAQNKZw8q~Ee{$t-%srOI4Anrox;lbCsNa81n(-Ju<g~n^kVV#MGA4V zY!2*RH=|K-ZH?ru@YUJI3r@a~*gEOv?OQD!k~SB4#qYe2dZ@gz<eyO(fAY8LiidwE zO8yPg%Dw59prjzc`DT&mvqNsrcVC#jZRe)k^0(zo9?si+bQZsN+}iOdVPm4B&9}d7 z>txnF`)PDYB}Zd*?NUScyen%j^(xhGd7N-^raRM#Ttm-uYMYN%l{r0s{-xw&zV|V^ zPZuwkd9O6qIXw4R6Z7uYjIHZ-DE~-SS@ZR7-!@mH@0LsVAIx-|{6x}EyR&rJg9*a? z_J5}QS)BbyE4F)1hHBXnMge(=q^F7zO&ymkLR1+aCj3%)Tw&2XUB-oPk+^O3*Jg(| zrd_eMzfbg(%~x%+>rnrl<<%*8Kt^EUc88SZ#Wze>9i15-^n1qBdnZpHJpTRplOF-| z{3n&nop<?X&$-L-t)F<$)v8$6=cjBua-{s0eeCVeyK}9-&slLfT<9ye*!Ms3>nyqw zjGAp@D$d>O`X=zEDbcc%P0;_$wZn4`If>j2-^q2eI^ginYL34T1N8r~&*Ck;_bIwU z;#ij~-{a-wg2&j})?d+WsQhRBMB%gTt#fZKM?bVKK6>ikSEHw2>oX2^XIjlZdpp0r z_55<~ns?%V&gz`w(y)=bG<lm%WL#Kzo9G6aoci()w|b^IW$)jfv*mkK-?2^K_HHcy zx_8>^zAQySyY`;<-hn%I#YVoJe`4YUw&`~k#q62EUzT`%_PL(<+t*KfzNFi^K{rCT zCP86GllHsK3oJ7&(?mC4kG(P3?rxjKvBR<2a{n9F)G>jw+?5?6FHM*k7>@Gb%X0j% zVy?KPGA9*W8C(s!U3AMpp!R)u#TP#w$z{Htb9ht_w@h0&H|E#AZPz0@b#_jYaoV?U z(uT7~QyOB@qU1J|l-#y`Uh#Bu_wt^~ehpDk8-=-x%|1?WS6WshHDUMd#MdnqUd;WC zoPCTl7b}TBpMNgRcIk7Tsp;2>ug+#wxoY+P=f19>GY+zyGCL)`Th2JlUN`5LvEt*H zE{>^2x6{wq=WVzDEg9gqD%a}nF*nKHB>_qo-6x)?nW$pow0&yiz0BQV$5!!f%9!t9 zG=bmsp3(283o<<<uNI5*eB89cd{&X7Vv1U{Fu$ew&SOfuj<9eooBH-t{H8b4Y`;rz zt($88y0J@x_XL9kuc|7W=kJ)c${b$bMOzzuTEy=07ChzM_y0p4+dWf1ZnjJ3x<VX| zDJ{LV`-uAk1J=#Awzu0Zc#?KVc|(JYzQWfSW>cd8(`&H_JU_M^ny~#{-nGS*e$maH ztxi(2JA!XX`Mz=5?{oRSkKO$@J1w=(S6#`!@Iof#+@;TV3g$MqtZcE_dVKf7z?_$c zn+(;N+Id!A@V>G1OZtl?Utcq%?Un33nv?KlLQ|->Py&~@wn@vXny|$@!nyqCS|9%L z{=GWaxJ4q>VMk$|r_<IT(MrAE{@<-);h$|><BZp)2FfK{uDS4yX;=R>zAw{r_P$um zdG?S~<#i?IcfLo$^J}WSjXyv0E!gG|uCwj_lco2jSMcxWe!l-yfph1JUwRKb3>06h zyjj1$pLw_ZONaEnTHcaW7n7^Y9xmXD*I?y(Jili4ZVS7YT>DMm<}1zH|6(`$k(y1L z5?-=Sb#{q;q&O?6Wm^9>UJ;>O+iRA2@{icU4UR3Z_4J*?`_tT=|3bCv;mNvE^Y*xQ z&D!UytMlUK#kpBip6J|6new<aG23)T_{1q4y<b|^OxWvIS}uKcy>FJ!)I60Z2Xc3= zvCi0^^;Le##MtRKc3+vozxOKNr5c8vX7{~$Z)(>}@7^BL_sU69`nzA3j!Arq{uSP6 z&A3?ed%^#fZ7BI@A5b07zCJYJNODcqhy4!sd;iuqUfF)-)&-GSe$6V@k6KH4ttPB! zlP#aUai-JNpy&0Bpj<Wox7T$^W(I}^ytzu4fq@~~IX|x?HLoN-q_QBjI2P2*+B?PH zzr{e{==<NA9_!h<<a!Jb$obg3lKYU8W$mE6#drC--S_WlZaN|%T=utIQL*;@-+Mls zN2|=goD{Rk%JwRo_f&^bdF`cTx;LF`XUA<U-KzC!FSCzF_J)*Lj(4+)r^xa!8NWVo z!87@<0@L1K)2tQeyK?_gI<LQTi&_I`W{2b&t#_Qyw>G6H*X$5xp3-Aq^<rZB$CHz0 zmpi9S^Rc*|t;zKDar9!li<de@<}PNle*2+^rKjFEC@gPJ=?P7X?p>1$wq<T%IaMKf zV$#9BIb0#HG=yp{6v#wJ@h?BjzuE2k#3SLauWq@s)8yp;<crEuiY&jTd^kS4FVkcb zi_59SoO~y{MWvdqJzDwy*rDlGXD)WD+Ufl-w?3oZd?D?p-JyN$`*-vjmLFQrW%o~i zl{qN13(79O)MaE~hyynp7@0&E5Q728onmgJfdL2;(%1(#@B_S2HHIN|^+0`Qhz1ZT z#E990M%Rtp=>WA?K>8rKlnF^UtaFI29l4PX>X|^af=Epk{QVSklaQOSs1C7Ug_?xi z#7EbT+#&<DbwN&m-~=|Pc9fPcx_;#91=ad1>`?v4tvGb;$oU9V2SKa`k?!13?T8u) lT{CjB0A)3hHVB@}hc)8`c(byB6mT+dGT1ONFmQw89ROn1)$srT literal 0 HcmV?d00001 diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 1573790..4df6385 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -121,6 +121,7 @@ class TestRemovingThumbnails(unittest.TestCase): zipin.close() os.remove('./tests/data/clean.cleaned.odt') + os.remove('./tests/data/clean.odt') class TestRevisionsCleaning(unittest.TestCase): @@ -142,6 +143,26 @@ class TestRevisionsCleaning(unittest.TestCase): os.remove('./tests/data/clean.odt') os.remove('./tests/data/clean.cleaned.odt') + def test_msoffice(self): + with zipfile.ZipFile('./tests/data/revision.docx') as zipin: + c = zipin.open('word/document.xml') + content = c.read() + r = b'<w:ins w:id="1" w:author="Unknown Author" w:date="2018-06-28T23:48:00Z">' + self.assertIn(r, content) + + shutil.copy('./tests/data/revision.docx', './tests/data/revision_clean.docx') + p = office.MSOfficeParser('./tests/data/revision_clean.docx') + self.assertTrue(p.remove_all()) + + with zipfile.ZipFile('./tests/data/revision_clean.cleaned.docx') as zipin: + c = zipin.open('word/document.xml') + content = c.read() + r = b'<w:ins w:id="1" w:author="Unknown Author" w:date="2018-06-28T23:48:00Z">' + self.assertNotIn(r, content) + + os.remove('./tests/data/revision_clean.docx') + os.remove('./tests/data/revision_clean.cleaned.docx') + class TestDeepCleaning(unittest.TestCase): def __check_deep_meta(self, p): -- GitLab