@staticmethod
def __remove_xmlid(full_path: str) -> bool:
    """ Overwrite the `xml:id` of every `text:list` element with a
    predictable value.

    xml:id are random identifiers that can be used to ease the merging of
    some components of a document. They can also be used for
    fingerprinting, so every list gets `list<N>` instead, where N is the
    position of the list in document order. A positional counter is used
    rather than a random number because xml:id values have to be unique
    within a document, and because it makes the output reproducible.

    Note that a `text:list` that had no xml:id gets one as well.

    See the W3C recommendation for more details:
    https://www.w3.org/TR/xml-id/

    :param full_path: path of the XML file; it is rewritten in place.
    :return: False if the file can't be parsed, True otherwise (including
             when the document declares no `text` namespace prefix, in
             which case the file is left untouched).
    """
    try:
        tree, namespace = _parse_xml(full_path)
    except ET.ParseError as e:  # pragma: no cover
        logging.error("Unable to parse %s: %s", full_path, e)
        return False

    # The id attribute is carried by text:list elements; without the
    # `text` prefix there is nothing to look for.
    if 'text' not in namespace:
        return True

    # The `xml` prefix is bound by the XML specification itself to this
    # URI, so it can be spelled out (in ElementTree's `{uri}name` form)
    # instead of being looked up in ElementTree's private prefix table.
    xml_id = '{http://www.w3.org/XML/1998/namespace}id'

    # 1. find the elements to edit (findall returns a list, so the
    #    document isn't being iterated on while it's modified);
    lists = tree.findall('.//text:list', namespace)

    # 2. actually edit the document.
    for position, element in enumerate(lists):
        element.set(xml_id, 'list%d' % position)

    tree.write(full_path, xml_declaration=True)
    return True
zCK@{b`erI_JkwL`?!*5z&p*k}e)RE@**WbM^SlFhEtKEE$Dt)DUi#8beqUAf_l>_o zPD~f~V>I?`7i;+zb>WEN%0<VwCauxDyZM@|TZW8#eNXMmED=`6o~$DsYWK9;KW^Tg z=5unXwML`r%gTP%%!S8iCEaY?p!-3(`}xLcpO-#-bbfDM;=Jb>eNj^`_P%DjdPZb1 zht!N$-!;BCr$lN#W%Cc#saxC2qWipep8x;s#JhKMlK$?Cs(2DswtMpRRWoFRSzV+L z8Q*T!;#zco+ep8IKe6eu$nnnkZx-3`UeeLOFt?{Eux9eCM+(OSpDoXv^!AViCwI!m zrP>DT&K|AK2`FCs>UiRR-FtsyqVKfaPrtZ-OXI#Sy9=xLY%t|8I2jb`|7XdA-#Id= z`#wH)=l74{nf`cvZP^*2=Uf(LUjNuu1q#{JNLC!Sx&37Ay`cK)1$)huzs(9xzh-J_ z`XJDfBX-v_xwMJfm+TXKn)&KW;M;X43NDMR6n%NlY{HtxK&`Z`e^*`Hyl^F3D)U5L z-7|B0)7LG0&tdBOu3?4Nh27>&@Ba2*wFgD~%y%}uhZz|d&f<-DP`)WnEh)*&OD_gR z{@$qj#kVX3>h|wfI$Z5FNiFlR%1eo$sC6m^GfawWg}*hGPTVTz$@k&@eXlkXRg0x& zyVMWd*nR2Wk8ii9>f7rE6@FmxDxMPF`?%b_mwAHVv6yq~q`p6X_we|!<*)ba6<R$N z+>+GuNo0xTl?0x<i}+s@u8bBurMaBFNHzL|$#vb6K^Js|w;C|-tXuQ-)4OTGVKI>$ zdh+X~It?RN-<)&)`TJ+@?=PHX+OMytH|yQ~uWLJg)GVGFyXMT@nxc}DTRL_2_utmN z-L>JW-rYlC_v2?gTT-}p+u^F;*Qc-ED<a>1#~~}Xc*Cxosj;RRC)cX;eNJC(y3O|c zYvcPb|2Z&9Ctr*BpFB<SuX4aSsq603&t6w56TjWQ@}uJS#^Y)?wSGML@#T-R?Zf!S z^ON45wzISPRS@J`HRC}Ii{Vj;5+Swp1?DC`0;~o%cbI9lXIWKkT3fL0p;)rUWV=+S z_d5R!Qg13wH`Gz6%U$!9@!7nE3E`*j`rPX_pSG&^(X|_9)3l{-pVIB!J9+cs$FGu3 z8$JE<<;$W~A!pT@zRlP6dvnkDwx{gWWgWf;Hb-r`ox!tg>Cf=OhxP)O4Lfh~Y-5?a zvemtI+mSmZ0Z&%mIrDPr7paF~9{I|4?{rtj-(GrJsd-h(+vlRWmoF=FbnqOBbrrYK zv7I2eR`8Lb&Y1?S7a<C%e|ua{bDi22Q`4D}TEG54;cwoMj&l9GM_XeIon0RIXZ@J) z;1ZAMtDZG)51#S+$D!le=)30L>^~FCv>)WpnKJF@E-9UTd^4x-di|tdGI6^8L)j$; zE)Tu;Z1tY)+Hc&tQ|D`;uc~?1*;tR?g7?L?H0s?v8O_2Pw5agqPS)<%qLY-GIF7e% z;r0kPvZ3?%?;FRSn;4&-?K!=<WB(_KFB3Gk)z{xC$eo;MF!$C0sYTDOWlX#Bw%$9@ z&amrMc(~5jT?#DkrcXCGu*RRq?o5QsZBF5$+l${7E$!XAr{0?_Qf=BbPvfJTo=ns1 zDwt|yd~CPP+=frdmkmoDwO#Zod}r&~u-!V?^7oahiMl|$w{QIqucYORnlsfB1l_vQ zpZWS;-TO?0b(ZnVc;ER9FY5!0CnQz+f4x6FKd14r(ear2X@&=LDpl7?Gu$gvsF>zx z^JL=5sneg9vl=DqY`T|t>PpZ4SLWXitzx~H5#Ha?TRNl1;f?(=jVWwOWl;`ynM(gm z?B6S(zPb0{TH|T*KdV@pf<AOgy_NgQv}sD?b<=772i|*b5-Oe9m2rI4wx3FY&eIxS zY56!C{Ct*?Bp>x|^9;!iQpX$u{O^B%KhbZU$j)DFfo4X&u13Byd$#w8XB?htsc6`B 
zJjwWJ$>v8XiXF+`T4C$t7Qc?QQm8SJxShMT_SyA2h8+$!V>VANofOmK;nmi`IbXG% z<+w|-4*S}JhcvjS-fda_ZRu&T^7Y46cHd4teCC_+kr-D~l^{p%67|Gsam`VaU$eVz z*HXHXx8^bP*6Lr<8ZUobHkcmb7kmEc;_?KR+nf(qrQGqKxH(65O70uEjsxj+1yeWu z{M=NdH1h>>_x*sN#e$jwhZ&FNYq^|tS-`brtx<r_{Q28<rey8({*V`$pWt+OO8YOH zOEtphW!}xN|5H%*Z6jM~uVUQuV%?mAzjYI&bxxi9r)zMc*Y?J%!@?c)YmDAbUdz&S zUZv8^WX9TdZn;<2q<-Wa_YgfY@BF*J42hy2x9-UfeA9h^Q`mfUaoTB_J-!DWel9P_ zeo$kW{c&ZQ!B26e$K`%cbG{$-E4-Icy2?G<<LUi#dDoY!`OMVN&HVbS#A)|@)m(*> z6)BedtMa}wo2r=oKGyZhc;`~x8S@XQ8F1cVbi8;dFvRBkof7|VJHM5$OX3f*mG%CT zHi_f*=aNk~qWyohJi98*=iJ(6)HUg)^d768--o^Gn#(i(o!k2G^3uC)bq~~brW&Om zx~}tSy71PEJKlS@N*}K-+Gl*5!$d;*RHLC+aKz;IZ~vU;=Pcztm|&$kH|`db__?wt zLML`KFx{HS+sP*!s8>_US)1~)qr%iH@M%J&x_fCv+VZw<9X%gH_?Pr@*~k6jJN2Q1 z^ZdPC)44zSvDiB(ma6a1^vNpit~F~+fA@H~6^F#bKhh2S+mdg$3qE>wDweyj?Bl;D zJ086@;Y-?A^h#;T+G}-vhjq?G3q5uGb)Gr1JE5%a_io`QvlchJKk7Y$`KI8C+LZgc z#-6iJ&DWi{uCr1*A-llw^k0s^F9I5jOBFRN?Y|#lZB@{HaB9X}xnGvPal9Yj-nNxX zGbu{)Y_SfB{H&zE-1&;H)O_8FjA=|?WbYlA@^q2cw(0v0Ua9$NRkZny>)Y4=89;Sj z_SDFzU2F^tuXJIJ9ax{Et2WJd0V4wg2!px_AsMB)NqLExImP-VFs5EXUV3MMpF1y? z6c+;n1FxrtOArGC!&U|cMg|UM1_p*he~-UpU|=u~@Ck8kX=#}}dGee&a~3XKxO(;K z&6_vx+O_NO;ln3RoVax9(!G249z1yP>eZ_+U%veN_m6>r;s5{t`Hh!^85kHfJY5_^ zDsH{K?#<O=Aixr^_~(krU;g%6ze_jgUidIn?}J8HOU^_?X;TFFnjf~l`0P~ASnngt zuW#BuF-P&cWtH^y*!>qv*X&Tfz3%&_t;(_2*Tyd1y*@s0eZ<Kte<i=zor(-zdA-Nt z)Hd<mb?&zFC;Qpn&e?fo{_D!jSAoK1e~+whm^FC8ul+TQj}|<cbo>8;tDttir>mdK zI;Vst+)Y?elogj$=A?q#F>hn;uDiWMaNql2iKv|S8Ijx0_urlK=BQY=-h`A!-&?m? zbpy@Jr)W&L^09E;FP+*gcAo7^Gy*rJoL0}Q7PFI1U@o*f$Md}3IqKAG`FTcWl`2j_ z^XyqJmWcUy$bD{kcDwz-`}uWF-w*QV|NqjqT(kG2)x*M>6Jr!t^7WkH^f~wRmXPIh z{)2uilVscWDD0G-TeOk)`R1EiTh2S0s!uCgnDwV_|NM92Y`<S#54b!*$zN;DMQa^~ z*shwh8<!N#ecW?mYuELtNtd(UHeOHDEOW5SJARH;qpLD3!PUIyhmTO>_vD=!IuaM= z>v5GHvNB(&z$$)f;nzpUxh~(hmbEq8ZGCw2-<y(w6>m~{zdpA!oK^qsbLN()wNB1! 
z7g|r<n7Cq@RmQ2~uRg0tdi`wpUtu+k*FW*zo^RDLt0Sj0Nvck^Q0aN0Y1FeUP<C1L zM3ud(nZ)&-X0(gC^-p=TfAv<YnI@<GE~z9<?CC$~`e?oVF`c|cUOBI0{3>TQCr$V- zYX2!jz|oY`*f{s@qYJ)Wlf66A79P8@Z%NBe(JL41HXaa36}lKBc(8k``R=#Dw)K1B zdrk899h&`dWzvCvZ}yxqbLsiU{^Os<?Sz@XC;z+8%_a8X!AH*>Wr5j+&$~7;?v(7Y zS9Z`^T_qFslx6i@hgtXA-#2Fb`~EL>@*5@T?7-e!(R~pYl1grFc@bvtIY*&<v4B8_ z&8rrEwX0oweT(Jxlv<~pN-Xy&^a*fEQoFxzquu&;^>2~c-vs_JW!2iGNb?=#E0b$K zrD0vt!7IFXw<pu-8mCJ^hc7$T3ANN5+1XQ-ViD!tq`FA=<iXPd`9=OqZPcI2w#FDZ z8vW$Bd`T{`{ut}@`j<>^UwE>zoG-ZNBocB|AiO^!K=^>TMtXR4ocV#?fU~!rX@~sQ z{we!^vCZ0P?UIWxIciO+?e&WbvC&U(%M{>C^)bzh_F10k6cn>*)yI-K+Do@FiCast z^F`Y(X4=N5B+}{@s_&w8&|Wx$U!r~0-@8eb)y>lny1e(_u+<<e_}#PBCT$#kKdboH z2<T7o{4z)RZ;bCm>p356nQ|3nFJ5*wn639{&Gd>F0-`1>IBqF6r8FkL5V_dL{j2qL z)2fA^e@}kkzIVkB2`<w-o$JaO{O3=d;JLPOc~*N)@T?@Yy&PH_4jj9un=L)*m~xEP zx>SdVi!BA*N7vj)R6np{qacg(YYx3d8?#*b&I&JDz1^<#hm^JN9HkAD9Sc4<y}BTE zZEaij)4mWr)yA#QZk2AkajPR>k&5opwk$c9o`v4OSYKS+6Oh*PuqOJEF3-|KJV!Px z)mr%Z<@H-UE(iVk?Sv=JzWVi)%Z9ZN_e7gY^-Ozha51MU_{z2tEsIyv=PnM}WpSuq zzHjNPzF8BOH-0g9%I=6QanJC*dvV>3SzkW4FDQ)FzI*&i!>#KFPo7}8UCelJ`swzB zJmWb{+g~rgyYR-uJ3mAW=cqgoT)e`w&doS#eQrvLc&gm$OGa0YGW#&NG4jvnuNS|l zu`qz8NS7t5clL^tzISC`Jvm$9w1sn9T+%+aAKW|z_D4EaC*0j3ta8}Iymub=zQ+%A zymr2J%K83wu|h_2jK}&cd-MHo91m)5>-ljpmET=jujJT^f|g#Er0&K7+1K&f5}f<q zbF@$NOR85};?R}fdFy5J%yVapn`#@)p4dArQ;kqLm{42zE;XVlv2>QHXZe};`7G*j z+xRy%H~xy_{knS6>z;3n^AjFy|M_M+uRmXH*4))Cl41d!2U1$^eVtys`L+3D>2*`= zgdCj(&YGS)7S5x|SU-J6>GnsrIG4qq@ZH^TQ%?TL&P^r(#Xd^ACVjr2e|7iO+{hK( z+##x4onr6p-#ydWcFSUOFaFTuac}*_cX>?>R9K$RD|cmw8_(-IBH?RGCvQogv{j!m zVY7~eU4^F6$CL4s8_u6wdvH(Gz239hm(|{I+<m98a?y)tzdwdmvA;cBJgt7w9)q(* zE?(Q2itMx$7UxS$eD^46&fK~q&*psRRc7T;{?MAbW7XF9;I^~+Oy9~b$;?cdo#b>f zHtt-T_h*Oo>BqnDWK=!O;ADS)QE$81>J_$=|GO(1uG{(9WZlcD=N2cwO1IcyS9`c) zQQQ5K9lzY}{a>8+k$tm?2Yc+UH-A^Yxi6jdWC`=?=<}<#8<#y^y6HN<yHNecDGQiW zo=n`pm8ZL&{XkQxeR!J3{lw%IFSdG@rmkI9R{Lu5jwQ<uwksIE^j~GHa9CQwxhJo< z_WIr9+n85Y%swb$Rklx^^Oj6bZ+qD7%{A-u9Au-uc{HDi=e=Zh;?Y%w+-4mx^XzKJ 
zTWmXPO+9Wal&@#Ylz#FLB)2Ax_fqDWwD6RPW+%-jKfGGsy&(G4iv2UMJ@FLzVPjL7 z5p3iZvYp?o|KG~x`zy^2P9Kb^v}gJM@A>|?4e>S)zvf%`?c15TmhZ>26Rv0dXY(^= z9b4UOue`G8!{&8%)g9JVk5x(>4IZrFj<MVy6@KshqSJFS)%WC->#e&o_o7METD`*M zvp;OQyK_%e`MtPjXKstXKQ&vs^ywcV_jB=kr1RHb?Y;ao_4iVcn#%I6@AmA@*<T_Y zx6S(f`YUt0U%IAVI~2c7J3RdLsqU!xN97lMy|~>t{N?GVC1p34y;S=X|L5hzzursd zXe$Mt`8*}>^sBvVXEm&n-yVE!YTditZP|I<UtWLcpE1?te@!m?U6+?L&+g%Ueg0(Z z8iljxYk5Qd@3Z!Do}})~b*YZ^u+@>`OBol`(>{n-{}ycA$+_wF+YO~!YQ3T*E+Hnf zjvuPoyrN(5-goh~=+8G>>vDxEE=gE6i7tQ3IkEP^(yY7O`}8+yykIQ7r(!t8*7wWv zgwH3qGo1_c&CZ5j-hEIsOzA)aSE1wA=jl^d?+joH7A?8IOtNBfnq5iO2R7HG7cQK> z=A*DRGmhCz`ySV<EAkVM82??)AGovPp7))En=_Kl_RhX*CiTN2Fpp0zuW<8=<B_?= z^Cmm$1x)`E-Vu62wz=o1|Ap1v5BLr}RXOz`c(I1Y&6?^EMitFDow+q<mTi8qFS4)t zudGOGhCxMdXIdaX+wxzkpY1$~Fa8zqIG6D^i`TDCf9a-~iJy=5acku1CFSqSk9)oC z-`tu1_bA<#|DCthy6Nlci_a9l_&(s3GtS!d#X?uxsV(*F;%Q;LS$^vG^S+gSyL59k zpX|59|Mn~&zJW)OI$GKvI&d;DcsRgX)7XX%xfvK3lJoOQK*RGDxj9E8yz?KM3GDkG zE}?Zn+WGXmZP}(D<HQ)bR5Aj0yXJF8Ey>w7WyPc^^8fca2Tszw9kk=MYjvT+x$kGY zjh(gn<@<Kdcr)`-jF^AmAA?@CH6E%@kABnqRr|N+p1b_C`!=U^9&389@L4@alx^uI zgAe6QZ=BTqw3gO%l^vVHd2CZk>+RYJS>5TEUo!13y;2ag_W82I@_g;>hxZ&?SARIH zjmKK*&Z6yIL51ABvz}LUYZ|h=wY09eZPaHy`F_fg?xRfCFY&GV+3-r@|Ic5WyvlXd zJH&5IIK-XcF2WM9QfZ<~@mvv=bSuNmq~?6%sTB{F_^rQv;dicv_<{-V_o%exvwWT! 
z;L5i!N=J<E>hjf>dSq@`uZhr{ntAb?jZ<%Aj($|Jkd8$h_qUst#wt^CHrjpHsQCE3 zKr$ke`)taJiHpTFCNsAyZ4NLsV!pS=`plN%&1<Htn7c$XttqnW*hl9(TMt*PuC2eA z6>>JSV&<8Z^1f=JbvY;IE_7qIp5{~Nt;Q6?#5BpI+0D@Bq>-ie2hn9NOP+fL-OgK< zvPo+}ZzA8_p!`fv|HG-uJ<n9;C3U}C^vLJJh7~HOzNUIiyK7%`n&1B0ty#y-Q-try znchej58aoj^`jv%D6UkZrBXt7Mf0NxjQsQZ4!srp+Eo2IZ-t|-^9g;832Lk7wcX=T zOfxrNz4tg=;F|q<{kog&X<s%SDqU3}X`>~(EBuU+{GPs?{*8O*vfpjk{Ol`3V%yvw zHJ2}L&)=1C>B*&Pu``QOf3p?vW#7q7`5@*X=_$ELN3`nai&Lv_&)c!p_fYT`?v#Ym zQ|3Z>@jGrrg`7M3(y1aWN#Fmh!eaL&b4B{4E5mx$dMtb_Z59=IzDanIuJ(cj3$}c1 z%>FxVsyJV6`1QvrtXtl+ZG7*c`PF~A!*=Ue4U;T2t2!pl7V)a^7g=BZ)Ob$u#eHw3 zA27uf@7#HKYyTVlPwuAgrnPexzFfk4`u)d~tGo2IXDIN02sm!?dx=rOySC*ku3g?5 z^!N7X@7H&1zPck)$aP!Sl-*$r(%+WG=}$;r`+(8Ab5Y>x8~ZK?aGFa8w;9{Ml>5Y4 zV%O89dFGks-1M!V=jki=o&BeM{6xRWU&T)<a}`S(Im_$nrhn#jDY(nN=fJV$H@^h` zv2WkLPfP0c@#og1sRI1h@|ACRRJ{_rcJ9;0e=Zl^%;&N*-)*<!-kdAZQ(gy65S}!( zQQX>@>7B!m9nIRC#dd$F6I*oY7_af`$FZlb<(|K{{OGgqajNonr{BI?HSg!PpgcZa z_4{J$Yjsq<9eK+X=kc+|u=e^2sa3E0dLAsCVgCBlEN0zpG2Y73@~5uN^v+tKb$H9q zs8tUV|7;B@JnSAa*WB&OnLpfzxAMFz6_)!k|IDX&P(M6STA7QHg@NIUAik1Al!1Z4 z*EPgZ*VE5UKNmK6Q<0mqcJleG!v;LZ-q&_Lc-^u0tBKELIj-uacZ_>n{8;;P{wz1i zw^ujuJ*HuDVRq`t{qHAFC~wbB@8bT^Fiq>*!TXVnA-CC9uD$X7<?25v?SIp*YdY~N zo_ZPZq+H}(;?YU!nV|<1JWB;q)+{Y`=v^DkF7ot#m8I6*#jTCHyBGGhzCQAG?Hq?& zYFQy?u9_?Eu=-%JWXi;_>@|*h9E&g21)tH)FbQ~8bpJeCiP5qWt$EsNa=&J^Gw`l5 z`o^cVY>sh6f7_8(DZ6tG`q9q~Zzm-6wl7xg&HbcomUDFSEk(ieYd^Q_i4(fk>0f@w z{1@lLn6J}Z!?PShny<3|NlZ#IHF4;v%s+hk;PT>Ebsh^jmHgb_t*BLKnw#aw^Rx6@ za{hy_&pxXiN#vgY_}!8H>-YU<1%<cb8{Nygj0_B_ETHgaWD;S(J=+TEO(QUD?lr(0 zRTn}90|RKh1i|;kGEE1XP=rhh5^G8`7E=(DuDDGBO)etbvJ;Cb$dj<R%|Z3kM=a(b zCS`G(0-7X6xJI23bNU%|k`}i~sQyaCViG8fVUxDF%|Uh1Of2T0PU7M=3DrfHv6zHB zsf*hj(6lZhjJTOF!w6%N7h)K!v4K2;37W1#80NvmzyL2dic->Gi5R96c?t|P!->$@ z&dk7&3!U#o*M&TYh-zO0D+2@aWEo5+u0ckGeI+~$47i3IVWz-bjoj=44Qe4w@#14( zKn!uA>q9PdL5(tmzC+?DO*C|k$fXCUOh#z@BaNek4)A7W11S?^5M)SYVPMFS1@Qnk Cs+(f~ literal 0 HcmV?d00001 diff --git a/tests/test_deep_cleaning.py b/tests/test_deep_cleaning.py index aab46c7..67f6dad 
class TestXMLidRandomize(unittest.TestCase):
    """ Check that cleaning an .odt file keeps exactly one `xml:id` in it. """

    @staticmethod
    def _count_xmlid(archive_path: str) -> int:
        """ Return how many times `xml:id` appears (case-insensitively)
        in the `.xml` members of the zip archive at `archive_path`.
        """
        count = 0
        with zipfile.ZipFile(archive_path) as zin:
            for item in zin.infolist():
                if not item.filename.endswith('.xml'):
                    continue
                count += zin.read(item).decode('utf-8').lower().count('xml:id')
        return count

    def test_office(self):
        shutil.copy('./tests/data/dirty_with_xmlid.odt',
                    './tests/data/clean.odt')
        p = office.LibreOfficeParser('./tests/data/clean.odt')

        meta = p.get_meta()
        self.assertIsNotNone(meta)

        self.assertEqual(self._count_xmlid('./tests/data/clean.odt'), 1)

        ret = p.remove_all()
        self.assertTrue(ret)

        # NOTE(review): this only checks that a single xml:id is still
        # present, not that its value differs from the original one.
        self.assertEqual(
            self._count_xmlid('./tests/data/clean.cleaned.odt'), 1)

        # Only files created by this test, inside the test tree, are
        # touched: nothing is copied outside of the repository.
        os.remove('./tests/data/clean.odt')
        os.remove('./tests/data/clean.cleaned.odt')