From 4e39b012cd47311b826e843ca8d9087948c32b20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Oudin?= <remi.oudin@lip6.fr>
Date: Wed, 17 Jun 2020 22:47:38 +0200
Subject: [PATCH] Fix #118 : replace xmlid with standard data

---
 libmat2/office.py               |  51 ++++++++++++++++++++++++++++++++
 tests/data/dirty_with_xmlid.odt | Bin 0 -> 8679 bytes
 tests/test_deep_cleaning.py     |  35 ++++++++++++++++++++++
 3 files changed, 86 insertions(+)
 create mode 100644 tests/data/dirty_with_xmlid.odt

diff --git a/libmat2/office.py b/libmat2/office.py
index 3a06624..ebf3689 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -442,6 +442,11 @@ class MSOfficeParser(ZipParser):
 
 
 class LibreOfficeParser(ZipParser):
+    """ The methods modifying XML documents are usually doing so in two loops:
+            1. finding the tag/attributes to remove;
+            2. actually editing the document.
+        since it's tricky to modify the XML while iterating on it.
+    """
     mimetypes = {
         'application/vnd.oasis.opendocument.text',
         'application/vnd.oasis.opendocument.spreadsheet',
@@ -489,6 +494,50 @@ class LibreOfficeParser(ZipParser):
         tree.write(full_path, xml_declaration=True)
         return True
 
+    @staticmethod
+    def __remove_xmlid(full_path: str) -> bool:
+        """ Overwrite the `xml:id` of every `text:list` with a neutral value.
+
+        xml:id are random identifiers that can be used to ease the merging of
+        some components of a document.  They can also be used for
+        fingerprinting, so each list gets `list<N>` instead, N being its
+        position in document order: deterministic, and distinct per list.
+
+        See the spec for more details: https://www.w3.org/TR/xml-id/
+
+        NOTE(review): attributes pointing to the previous values, like
+        `text:continue-list`, are left untouched; confirm that this is fine.
+
+        Return False if `full_path` can't be parsed, True otherwise.  The
+        tree is written back to it, unless the `text` namespace is missing.
+        """
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError as e:  # pragma: no cover
+            logging.error("Unable to parse %s: %s", full_path, e)
+            return False
+
+        # Without the `text` namespace, there can't be any text:list.
+        if 'text' not in namespace:
+            return True
+
+        # The `xml` prefix is always bound to this namespace, and ElementTree
+        # already maps it, so there is no need to search for it in the
+        # private `ET._namespace_map`.
+        xmlid = '{http://www.w3.org/XML/1998/namespace}id'
+
+        # `findall` returns a list: the tree isn't walked while being edited.
+        lists = tree.findall('.//text:list', namespace)
+
+        # Don't draw the suffix at random: two lists could end up with the
+        # same identifier, while xml:id values have to be unique in a
+        # document.
+        for position, element in enumerate(lists):
+            element.set(xmlid, 'list' + str(position))
+
+        tree.write(full_path, xml_declaration=True)
+        return True
+
     def _specific_cleanup(self, full_path: str) -> bool:
         if os.stat(full_path).st_size == 0:  # Don't process empty files
             return True
@@ -497,6 +546,8 @@ class LibreOfficeParser(ZipParser):
             if os.path.basename(full_path) == 'content.xml':
                 if self.__remove_revisions(full_path) is False:
                     return False
+                if self.__remove_xmlid(full_path) is False:
+                    return False
 
             try:
                 _sort_xml_attributes(full_path)
diff --git a/tests/data/dirty_with_xmlid.odt b/tests/data/dirty_with_xmlid.odt
new file mode 100644
index 0000000000000000000000000000000000000000..2bbbc7251ca831ef093060784d71d5ba005a5e90
GIT binary patch
literal 8679
zcmWIWW@Zs#VBlb2(9Jv?5O>UoN1cIz0fadi7#MOhb5lzy3sMsc3UV@&6H7Al^YqK|
zQuOi@i!+P$@(WV)Qu32ab5rw5^h#1IN>B}BKmrmB3=Gcsd1;yHrA1(4ijDM3^7C_&
z5{vXn)0LK!pIDNLPnQ%Hy9)9PN(*vR^GZpy4WtFX8)dNAmYAHJnv+_TSdvewJ3(Pm
zkeHX6gU6lNY%D0sPcKR>F2?UxY#~!zl2}qoK%YDoKW648rl%I`J7t#SCKeP!oy)<&
z0ZvCqG0(sQN=J!#nQ5uTC3;0EX=f*&&1*IgXnAkbntJYrgFY*3k;bW=S@RE=wneb&
zf0`w9Z(rZ!u)rm&Ud_Mqc%7T^{k|=;-zLhO=-#yAV2DYwrs^q=jIA%Vt#<3}TzcE;
z){Sl2^0Q<=W;HLJueGZ@Ps=iXTC1)(PtN(xi!>@`eYWx6qjd19MWF}Ju`PWZoN~QR
znO~pONLikH#F-WzSn_(ogv##<w%vEFRxa4YmAEZvZ~HcdiH#1uoIa;oN}6SM`FS5T
zp7uG=^5DX^kB%qoz1+OWCgFp2-}xK&I(U{njeD?n!@HQ~`!!QDs@^46m0mvj@xJnu
zggHj<{9o<1uKfS}pWgZZl8@GJDRaqjIJY?c*M#R=KFNo=uMhBM=h*T+;}R<)14A=2
z0|O)vU<*EQHcTx^)T_wN>79JKZ?=I*+xJ>7mvruD^AZ@9OD|pxS$n1DO6-#x=^fD<
zCK@{b`erI_JkwL`?!*5z&p*k}e)RE@**WbM^SlFhEtKEE$Dt)DUi#8beqUAf_l>_o
zPD~f~V>I?`7i;+zb>WEN%0<VwCauxDyZM@|TZW8#eNXMmED=`6o~$DsYWK9;KW^Tg
z=5unXwML`r%gTP%%!S8iCEaY?p!-3(`}xLcpO-#-bbfDM;=Jb>eNj^`_P%DjdPZb1
zht!N$-!;BCr$lN#W%Cc#saxC2qWipep8x;s#JhKMlK$?Cs(2DswtMpRRWoFRSzV+L
z8Q*T!;#zco+ep8IKe6eu$nnnkZx-3`UeeLOFt?{Eux9eCM+(OSpDoXv^!AViCwI!m
zrP>DT&K|AK2`FCs>UiRR-FtsyqVKfaPrtZ-OXI#Sy9=xLY%t|8I2jb`|7XdA-#Id=
z`#wH)=l74{nf`cvZP^*2=Uf(LUjNuu1q#{JNLC!Sx&37Ay`cK)1$)huzs(9xzh-J_
z`XJDfBX-v_xwMJfm+TXKn)&KW;M;X43NDMR6n%NlY{HtxK&`Z`e^*`Hyl^F3D)U5L
z-7|B0)7LG0&tdBOu3?4Nh27>&@Ba2*wFgD~%y%}uhZz|d&f<-DP`)WnEh)*&OD_gR
z{@$qj#kVX3>h|wfI$Z5FNiFlR%1eo$sC6m^GfawWg}*hGPTVTz$@k&@eXlkXRg0x&
zyVMWd*nR2Wk8ii9>f7rE6@FmxDxMPF`?%b_mwAHVv6yq~q`p6X_we|!<*)ba6<R$N
z+>+GuNo0xTl?0x<i}+s@u8bBurMaBFNHzL|$#vb6K^Js|w;C|-tXuQ-)4OTGVKI>$
zdh+X~It?RN-<)&)`TJ+@?=PHX+OMytH|yQ~uWLJg)GVGFyXMT@nxc}DTRL_2_utmN
z-L>JW-rYlC_v2?gTT-}p+u^F;*Qc-ED<a>1#~~}Xc*Cxosj;RRC)cX;eNJC(y3O|c
zYvcPb|2Z&9Ctr*BpFB<SuX4aSsq603&t6w56TjWQ@}uJS#^Y)?wSGML@#T-R?Zf!S
z^ON45wzISPRS@J`HRC}Ii{Vj;5+Swp1?DC`0;~o%cbI9lXIWKkT3fL0p;)rUWV=+S
z_d5R!Qg13wH`Gz6%U$!9@!7nE3E`*j`rPX_pSG&^(X|_9)3l{-pVIB!J9+cs$FGu3
z8$JE<<;$W~A!pT@zRlP6dvnkDwx{gWWgWf;Hb-r`ox!tg>Cf=OhxP)O4Lfh~Y-5?a
zvemtI+mSmZ0Z&%mIrDPr7paF~9{I|4?{rtj-(GrJsd-h(+vlRWmoF=FbnqOBbrrYK
zv7I2eR`8Lb&Y1?S7a<C%e|ua{bDi22Q`4D}TEG54;cwoMj&l9GM_XeIon0RIXZ@J)
z;1ZAMtDZG)51#S+$D!le=)30L>^~FCv>)WpnKJF@E-9UTd^4x-di|tdGI6^8L)j$;
zE)Tu;Z1tY)+Hc&tQ|D`;uc~?1*;tR?g7?L?H0s?v8O_2Pw5agqPS)<%qLY-GIF7e%
z;r0kPvZ3?%?;FRSn;4&-?K!=<WB(_KFB3Gk)z{xC$eo;MF!$C0sYTDOWlX#Bw%$9@
z&amrMc(~5jT?#DkrcXCGu*RRq?o5QsZBF5$+l${7E$!XAr{0?_Qf=BbPvfJTo=ns1
zDwt|yd~CPP+=frdmkmoDwO#Zod}r&~u-!V?^7oahiMl|$w{QIqucYORnlsfB1l_vQ
zpZWS;-TO?0b(ZnVc;ER9FY5!0CnQz+f4x6FKd14r(ear2X@&=LDpl7?Gu$gvsF>zx
z^JL=5sneg9vl=DqY`T|t>PpZ4SLWXitzx~H5#Ha?TRNl1;f?(=jVWwOWl;`ynM(gm
z?B6S(zPb0{TH|T*KdV@pf<AOgy_NgQv}sD?b<=772i|*b5-Oe9m2rI4wx3FY&eIxS
zY56!C{Ct*?Bp>x|^9;!iQpX$u{O^B%KhbZU$j)DFfo4X&u13Byd$#w8XB?htsc6`B
zJjwWJ$>v8XiXF+`T4C$t7Qc?QQm8SJxShMT_SyA2h8+$!V>VANofOmK;nmi`IbXG%
z<+w|-4*S}JhcvjS-fda_ZRu&T^7Y46cHd4teCC_+kr-D~l^{p%67|Gsam`VaU$eVz
z*HXHXx8^bP*6Lr<8ZUobHkcmb7kmEc;_?KR+nf(qrQGqKxH(65O70uEjsxj+1yeWu
z{M=NdH1h>>_x*sN#e$jwhZ&FNYq^|tS-`brtx<r_{Q28<rey8({*V`$pWt+OO8YOH
zOEtphW!}xN|5H%*Z6jM~uVUQuV%?mAzjYI&bxxi9r)zMc*Y?J%!@?c)YmDAbUdz&S
zUZv8^WX9TdZn;<2q<-Wa_YgfY@BF*J42hy2x9-UfeA9h^Q`mfUaoTB_J-!DWel9P_
zeo$kW{c&ZQ!B26e$K`%cbG{$-E4-Icy2?G<<LUi#dDoY!`OMVN&HVbS#A)|@)m(*>
z6)BedtMa}wo2r=oKGyZhc;`~x8S@XQ8F1cVbi8;dFvRBkof7|VJHM5$OX3f*mG%CT
zHi_f*=aNk~qWyohJi98*=iJ(6)HUg)^d768--o^Gn#(i(o!k2G^3uC)bq~~brW&Om
zx~}tSy71PEJKlS@N*}K-+Gl*5!$d;*RHLC+aKz;IZ~vU;=Pcztm|&$kH|`db__?wt
zLML`KFx{HS+sP*!s8>_US)1~)qr%iH@M%J&x_fCv+VZw<9X%gH_?Pr@*~k6jJN2Q1
z^ZdPC)44zSvDiB(ma6a1^vNpit~F~+fA@H~6^F#bKhh2S+mdg$3qE>wDweyj?Bl;D
zJ086@;Y-?A^h#;T+G}-vhjq?G3q5uGb)Gr1JE5%a_io`QvlchJKk7Y$`KI8C+LZgc
z#-6iJ&DWi{uCr1*A-llw^k0s^F9I5jOBFRN?Y|#lZB@{HaB9X}xnGvPal9Yj-nNxX
zGbu{)Y_SfB{H&zE-1&;H)O_8FjA=|?WbYlA@^q2cw(0v0Ua9$NRkZny>)Y4=89;Sj
z_SDFzU2F^tuXJIJ9ax{Et2WJd0V4wg2!px_AsMB)NqLExImP-VFs5EXUV3MMpF1y?
z6c+;n1FxrtOArGC!&U|cMg|UM1_p*he~-UpU|=u~@Ck8kX=#}}dGee&a~3XKxO(;K
z&6_vx+O_NO;ln3RoVax9(!G249z1yP>eZ_+U%veN_m6>r;s5{t`Hh!^85kHfJY5_^
zDsH{K?#<O=Aixr^_~(krU;g%6ze_jgUidIn?}J8HOU^_?X;TFFnjf~l`0P~ASnngt
zuW#BuF-P&cWtH^y*!>qv*X&Tfz3%&_t;(_2*Tyd1y*@s0eZ<Kte<i=zor(-zdA-Nt
z)Hd<mb?&zFC;Qpn&e?fo{_D!jSAoK1e~+whm^FC8ul+TQj}|<cbo>8;tDttir>mdK
zI;Vst+)Y?elogj$=A?q#F>hn;uDiWMaNql2iKv|S8Ijx0_urlK=BQY=-h`A!-&?m?
zbpy@Jr)W&L^09E;FP+*gcAo7^Gy*rJoL0}Q7PFI1U@o*f$Md}3IqKAG`FTcWl`2j_
z^XyqJmWcUy$bD{kcDwz-`}uWF-w*QV|NqjqT(kG2)x*M>6Jr!t^7WkH^f~wRmXPIh
z{)2uilVscWDD0G-TeOk)`R1EiTh2S0s!uCgnDwV_|NM92Y`<S#54b!*$zN;DMQa^~
z*shwh8<!N#ecW?mYuELtNtd(UHeOHDEOW5SJARH;qpLD3!PUIyhmTO>_vD=!IuaM=
z>v5GHvNB(&z$$)f;nzpUxh~(hmbEq8ZGCw2-<y(w6>m~{zdpA!oK^qsbLN()wNB1!
z7g|r<n7Cq@RmQ2~uRg0tdi`wpUtu+k*FW*zo^RDLt0Sj0Nvck^Q0aN0Y1FeUP<C1L
zM3ud(nZ)&-X0(gC^-p=TfAv<YnI@<GE~z9<?CC$~`e?oVF`c|cUOBI0{3>TQCr$V-
zYX2!jz|oY`*f{s@qYJ)Wlf66A79P8@Z%NBe(JL41HXaa36}lKBc(8k``R=#Dw)K1B
zdrk899h&`dWzvCvZ}yxqbLsiU{^Os<?Sz@XC;z+8%_a8X!AH*>Wr5j+&$~7;?v(7Y
zS9Z`^T_qFslx6i@hgtXA-#2Fb`~EL>@*5@T?7-e!(R~pYl1grFc@bvtIY*&<v4B8_
z&8rrEwX0oweT(Jxlv<~pN-Xy&^a*fEQoFxzquu&;^>2~c-vs_JW!2iGNb?=#E0b$K
zrD0vt!7IFXw<pu-8mCJ^hc7$T3ANN5+1XQ-ViD!tq`FA=<iXPd`9=OqZPcI2w#FDZ
z8vW$Bd`T{`{ut}@`j<>^UwE>zoG-ZNBocB|AiO^!K=^>TMtXR4ocV#?fU~!rX@~sQ
z{we!^vCZ0P?UIWxIciO+?e&WbvC&U(%M{>C^)bzh_F10k6cn>*)yI-K+Do@FiCast
z^F`Y(X4=N5B+}{@s_&w8&|Wx$U!r~0-@8eb)y>lny1e(_u+<<e_}#PBCT$#kKdboH
z2<T7o{4z)RZ;bCm>p356nQ|3nFJ5*wn639{&Gd>F0-`1>IBqF6r8FkL5V_dL{j2qL
z)2fA^e@}kkzIVkB2`<w-o$JaO{O3=d;JLPOc~*N)@T?@Yy&PH_4jj9un=L)*m~xEP
zx>SdVi!BA*N7vj)R6np{qacg(YYx3d8?#*b&I&JDz1^<#hm^JN9HkAD9Sc4<y}BTE
zZEaij)4mWr)yA#QZk2AkajPR>k&5opwk$c9o`v4OSYKS+6Oh*PuqOJEF3-|KJV!Px
z)mr%Z<@H-UE(iVk?Sv=JzWVi)%Z9ZN_e7gY^-Ozha51MU_{z2tEsIyv=PnM}WpSuq
zzHjNPzF8BOH-0g9%I=6QanJC*dvV>3SzkW4FDQ)FzI*&i!>#KFPo7}8UCelJ`swzB
zJmWb{+g~rgyYR-uJ3mAW=cqgoT)e`w&doS#eQrvLc&gm$OGa0YGW#&NG4jvnuNS|l
zu`qz8NS7t5clL^tzISC`Jvm$9w1sn9T+%+aAKW|z_D4EaC*0j3ta8}Iymub=zQ+%A
zymr2J%K83wu|h_2jK}&cd-MHo91m)5>-ljpmET=jujJT^f|g#Er0&K7+1K&f5}f<q
zbF@$NOR85};?R}fdFy5J%yVapn`#@)p4dArQ;kqLm{42zE;XVlv2>QHXZe};`7G*j
z+xRy%H~xy_{knS6>z;3n^AjFy|M_M+uRmXH*4))Cl41d!2U1$^eVtys`L+3D>2*`=
zgdCj(&YGS)7S5x|SU-J6>GnsrIG4qq@ZH^TQ%?TL&P^r(#Xd^ACVjr2e|7iO+{hK(
z+##x4onr6p-#ydWcFSUOFaFTuac}*_cX>?>R9K$RD|cmw8_(-IBH?RGCvQogv{j!m
zVY7~eU4^F6$CL4s8_u6wdvH(Gz239hm(|{I+<m98a?y)tzdwdmvA;cBJgt7w9)q(*
zE?(Q2itMx$7UxS$eD^46&fK~q&*psRRc7T;{?MAbW7XF9;I^~+Oy9~b$;?cdo#b>f
zHtt-T_h*Oo>BqnDWK=!O;ADS)QE$81>J_$=|GO(1uG{(9WZlcD=N2cwO1IcyS9`c)
zQQQ5K9lzY}{a>8+k$tm?2Yc+UH-A^Yxi6jdWC`=?=<}<#8<#y^y6HN<yHNecDGQiW
zo=n`pm8ZL&{XkQxeR!J3{lw%IFSdG@rmkI9R{Lu5jwQ<uwksIE^j~GHa9CQwxhJo<
z_WIr9+n85Y%swb$Rklx^^Oj6bZ+qD7%{A-u9Au-uc{HDi=e=Zh;?Y%w+-4mx^XzKJ
zTWmXPO+9Wal&@#Ylz#FLB)2Ax_fqDWwD6RPW+%-jKfGGsy&(G4iv2UMJ@FLzVPjL7
z5p3iZvYp?o|KG~x`zy^2P9Kb^v}gJM@A>|?4e>S)zvf%`?c15TmhZ>26Rv0dXY(^=
z9b4UOue`G8!{&8%)g9JVk5x(>4IZrFj<MVy6@KshqSJFS)%WC->#e&o_o7METD`*M
zvp;OQyK_%e`MtPjXKstXKQ&vs^ywcV_jB=kr1RHb?Y;ao_4iVcn#%I6@AmA@*<T_Y
zx6S(f`YUt0U%IAVI~2c7J3RdLsqU!xN97lMy|~>t{N?GVC1p34y;S=X|L5hzzursd
zXe$Mt`8*}>^sBvVXEm&n-yVE!YTditZP|I<UtWLcpE1?te@!m?U6+?L&+g%Ueg0(Z
z8iljxYk5Qd@3Z!Do}})~b*YZ^u+@>`OBol`(>{n-{}ycA$+_wF+YO~!YQ3T*E+Hnf
zjvuPoyrN(5-goh~=+8G>>vDxEE=gE6i7tQ3IkEP^(yY7O`}8+yykIQ7r(!t8*7wWv
zgwH3qGo1_c&CZ5j-hEIsOzA)aSE1wA=jl^d?+joH7A?8IOtNBfnq5iO2R7HG7cQK>
z=A*DRGmhCz`ySV<EAkVM82??)AGovPp7))En=_Kl_RhX*CiTN2Fpp0zuW<8=<B_?=
z^Cmm$1x)`E-Vu62wz=o1|Ap1v5BLr}RXOz`c(I1Y&6?^EMitFDow+q<mTi8qFS4)t
zudGOGhCxMdXIdaX+wxzkpY1$~Fa8zqIG6D^i`TDCf9a-~iJy=5acku1CFSqSk9)oC
z-`tu1_bA<#|DCthy6Nlci_a9l_&(s3GtS!d#X?uxsV(*F;%Q;LS$^vG^S+gSyL59k
zpX|59|Mn~&zJW)OI$GKvI&d;DcsRgX)7XX%xfvK3lJoOQK*RGDxj9E8yz?KM3GDkG
zE}?Zn+WGXmZP}(D<HQ)bR5Aj0yXJF8Ey>w7WyPc^^8fca2Tszw9kk=MYjvT+x$kGY
zjh(gn<@<Kdcr)`-jF^AmAA?@CH6E%@kABnqRr|N+p1b_C`!=U^9&389@L4@alx^uI
zgAe6QZ=BTqw3gO%l^vVHd2CZk>+RYJS>5TEUo!13y;2ag_W82I@_g;>hxZ&?SARIH
zjmKK*&Z6yIL51ABvz}LUYZ|h=wY09eZPaHy`F_fg?xRfCFY&GV+3-r@|Ic5WyvlXd
zJH&5IIK-XcF2WM9QfZ<~@mvv=bSuNmq~?6%sTB{F_^rQv;dicv_<{-V_o%exvwWT!
z;L5i!N=J<E>hjf>dSq@`uZhr{ntAb?jZ<%Aj($|Jkd8$h_qUst#wt^CHrjpHsQCE3
zKr$ke`)taJiHpTFCNsAyZ4NLsV!pS=`plN%&1<Htn7c$XttqnW*hl9(TMt*PuC2eA
z6>>JSV&<8Z^1f=JbvY;IE_7qIp5{~Nt;Q6?#5BpI+0D@Bq>-ie2hn9NOP+fL-OgK<
zvPo+}ZzA8_p!`fv|HG-uJ<n9;C3U}C^vLJJh7~HOzNUIiyK7%`n&1B0ty#y-Q-try
znchej58aoj^`jv%D6UkZrBXt7Mf0NxjQsQZ4!srp+Eo2IZ-t|-^9g;832Lk7wcX=T
zOfxrNz4tg=;F|q<{kog&X<s%SDqU3}X`>~(EBuU+{GPs?{*8O*vfpjk{Ol`3V%yvw
zHJ2}L&)=1C>B*&Pu``QOf3p?vW#7q7`5@*X=_$ELN3`nai&Lv_&)c!p_fYT`?v#Ym
zQ|3Z>@jGrrg`7M3(y1aWN#Fmh!eaL&b4B{4E5mx$dMtb_Z59=IzDanIuJ(cj3$}c1
z%>FxVsyJV6`1QvrtXtl+ZG7*c`PF~A!*=Ue4U;T2t2!pl7V)a^7g=BZ)Ob$u#eHw3
zA27uf@7#HKYyTVlPwuAgrnPexzFfk4`u)d~tGo2IXDIN02sm!?dx=rOySC*ku3g?5
z^!N7X@7H&1zPck)$aP!Sl-*$r(%+WG=}$;r`+(8Ab5Y>x8~ZK?aGFa8w;9{Ml>5Y4
zV%O89dFGks-1M!V=jki=o&BeM{6xRWU&T)<a}`S(Im_$nrhn#jDY(nN=fJV$H@^h`
zv2WkLPfP0c@#og1sRI1h@|ACRRJ{_rcJ9;0e=Zl^%;&N*-)*<!-kdAZQ(gy65S}!(
zQQX>@>7B!m9nIRC#dd$F6I*oY7_af`$FZlb<(|K{{OGgqajNonr{BI?HSg!PpgcZa
z_4{J$Yjsq<9eK+X=kc+|u=e^2sa3E0dLAsCVgCBlEN0zpG2Y73@~5uN^v+tKb$H9q
zs8tUV|7;B@JnSAa*WB&OnLpfzxAMFz6_)!k|IDX&P(M6STA7QHg@NIUAik1Al!1Z4
z*EPgZ*VE5UKNmK6Q<0mqcJleG!v;LZ-q&_Lc-^u0tBKELIj-uacZ_>n{8;;P{wz1i
zw^ujuJ*HuDVRq`t{qHAFC~wbB@8bT^Fiq>*!TXVnA-CC9uD$X7<?25v?SIp*YdY~N
zo_ZPZq+H}(;?YU!nV|<1JWB;q)+{Y`=v^DkF7ot#m8I6*#jTCHyBGGhzCQAG?Hq?&
zYFQy?u9_?Eu=-%JWXi;_>@|*h9E&g21)tH)FbQ~8bpJeCiP5qWt$EsNa=&J^Gw`l5
z`o^cVY>sh6f7_8(DZ6tG`q9q~Zzm-6wl7xg&HbcomUDFSEk(ieYd^Q_i4(fk>0f@w
z{1@lLn6J}Z!?PShny<3|NlZ#IHF4;v%s+hk;PT>Ebsh^jmHgb_t*BLKnw#aw^Rx6@
za{hy_&pxXiN#vgY_}!8H>-YU<1%<cb8{Nygj0_B_ETHgaWD;S(J=+TEO(QUD?lr(0
zRTn}90|RKh1i|;kGEE1XP=rhh5^G8`7E=(DuDDGBO)etbvJ;Cb$dj<R%|Z3kM=a(b
zCS`G(0-7X6xJI23bNU%|k`}i~sQyaCViG8fVUxDF%|Uh1Of2T0PU7M=3DrfHv6zHB
zsf*hj(6lZhjJTOF!w6%N7h)K!v4K2;37W1#80NvmzyL2dic->Gi5R96c?t|P!->$@
z&dk7&3!U#o*M&TYh-zO0D+2@aWEo5+u0ckGeI+~$47i3IVWz-bjoj=44Qe4w@#14(
zKn!uA>q9PdL5(tmzC+?DO*C|k$fXCUOh#z@BaNek4)A7W11S?^5M)SYVPMFS1@Qnk
Cs+(f~

literal 0
HcmV?d00001

diff --git a/tests/test_deep_cleaning.py b/tests/test_deep_cleaning.py
index aab46c7..67f6dad 100644
--- a/tests/test_deep_cleaning.py
+++ b/tests/test_deep_cleaning.py
@@ -5,6 +5,7 @@ import shutil
 import os
 import zipfile
 import tempfile
+import time
 
 from libmat2 import office, parser_factory
 
@@ -168,3 +169,37 @@ class TestNsidRemoval(unittest.TestCase):
 
         os.remove('./tests/data/clean.docx')
         os.remove('./tests/data/clean.cleaned.docx')
+
+class TestXMLidRandomize(unittest.TestCase):
+    """ Count the `xml:id` occurrences before and after the cleaning. """
+
+    @staticmethod
+    def _count_xmlid(path: str) -> int:
+        """ Return the number of `xml:id` in the XML files of `path`. """
+        count = 0
+        with zipfile.ZipFile(path) as zin:
+            for item in zin.infolist():
+                if item.filename.endswith('.xml'):
+                    content = zin.read(item).decode('utf-8').lower()
+                    count += content.count('xml:id')
+        return count
+
+    def test_office(self):
+        shutil.copy('./tests/data/dirty_with_xmlid.odt',
+                    './tests/data/clean.odt')
+        p = office.LibreOfficeParser('./tests/data/clean.odt')
+
+        meta = p.get_meta()
+        self.assertIsNotNone(meta)
+
+        self.assertEqual(self._count_xmlid('./tests/data/clean.odt'), 1)
+
+        ret = p.remove_all()
+        self.assertTrue(ret)
+
+        cleaned = './tests/data/clean.cleaned.odt'
+        self.assertEqual(self._count_xmlid(cleaned), 1)
+
+        # Only clean up: the test must not write outside of ./tests/data/.
+        os.remove('./tests/data/clean.odt')
+        os.remove(cleaned)
-- 
GitLab