From 174d4a0ac09c2e9d4a9aa3677a442c05459b8309 Mon Sep 17 00:00:00 2001
From: jvoisin <julien.voisin@dustri.org>
Date: Thu, 20 Sep 2018 22:37:53 +0200
Subject: [PATCH] Implement rsid stripping for office files

MS Office XML rsid is a "unique identifier used to track the editing session
when the physical character representing this section mark was last formatted."

See the following links for details:
- https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx
- https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/
---
 libmat2/office.py                           |  61 ++++++++++++++++++--
 tests/data/office_revision_session_ids.docx | Bin 0 -> 12163 bytes
 tests/test_deep_cleaning.py                 |  31 ++++++++++
 3 files changed, 87 insertions(+), 5 deletions(-)
 create mode 100644 tests/data/office_revision_session_ids.docx

diff --git a/libmat2/office.py b/libmat2/office.py
index 5c2c996..07bbbb9 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -8,6 +8,8 @@ import xml.etree.ElementTree as ET  # type: ignore
 
 from .archive import ArchiveBasedAbstractParser
 
+# pylint: disable=line-too-long
+
 # Make pyflakes happy
 assert Set
 assert Pattern
@@ -15,14 +17,12 @@ assert Pattern
 def _parse_xml(full_path: str):
     """ This function parses XML, with namespace support. """
 
-    cpt = 0
     namespace_map = dict()
     for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
         # The ns[0-9]+ namespaces are reserved for interal usage, so
         # we have to use an other nomenclature.
-        if re.match('^ns[0-9]+$', key):
-            key = 'mat%d' % cpt
-            cpt += 1
+        if re.match('^ns[0-9]+$', key, re.I):  #pragma: no cover
+            key = 'mat' + key[2:]
 
         namespace_map[key] = value
         ET.register_namespace(key, value)
@@ -59,11 +59,56 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
         'word/fontTable.xml',
         'word/settings.xml',
         'word/styles.xml',
+
+        # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
+        'word/stylesWithEffects.xml',
     }
     files_to_omit = set(map(re.compile, {  # type: ignore
+        'word/webSettings.xml',
+        'word/theme',
         '^docProps/',
     }))
 
+    @staticmethod
+    def __remove_rsid(full_path: str) -> bool:
+        """ Remove every "revision session ID" from the XML file `full_path`,
+        in place; return False if it can't be parsed, True otherwise. We're
+        matching on '}rsid' instead of proper parsing, since rsid can have
+        multiple forms, like `rsidRDefault`, `rsidR`, `rsids`, …
+        We're removing rsid tags in two passes, because we can't modify
+        the xml while we're iterating on it.
+
+        For more details, see
+        - https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx
+        - https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/
+        """
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError:
+            return False
+
+        # rsid, tags or attributes, are always under the `w` namespace
+        if 'w' not in namespace.keys():
+            return True
+
+        parent_map = {c:p for p in tree.iter() for c in p}  # child -> parent, needed to detach elements later
+
+        elements_to_remove = list()
+        for item in tree.iterfind('.//', namespace):
+            if '}rsid' in item.tag.strip().lower():  # rsid as tag
+                elements_to_remove.append(item)
+                continue
+            for key in list(item.attrib.keys()):  # rsid as attribute; iterate a copy since we delete keys
+                if '}rsid' in key.lower():
+                    del item.attrib[key]
+
+        for element in elements_to_remove:
+            parent_map[element].remove(element)
+
+        tree.write(full_path, xml_declaration=True)
+
+        return True
+
     @staticmethod
     def __remove_revisions(full_path: str) -> bool:
         """ In this function, we're changing the XML document in several
@@ -112,7 +157,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
 
         if full_path.endswith('/word/document.xml'):
             # this file contains the revisions
-            return self.__remove_revisions(full_path)
+            if self.__remove_revisions(full_path) is False:
+                return False
+
+        if full_path.endswith('.xml'):
+            if self.__remove_rsid(full_path) is False:
+                return False
+
         return True
 
     def get_meta(self) -> Dict[str, str]:
diff --git a/tests/data/office_revision_session_ids.docx b/tests/data/office_revision_session_ids.docx
new file mode 100644
index 0000000000000000000000000000000000000000..b40a3415ad56150781929feac25049fea41db8e7
GIT binary patch
literal 12163
zcmWIWW@Zs#U}NB5U|>*W;8baEYGh<!=woGI5N6<DigwP=D@n~Oi4UnPNG*=ltH{k!
zSj57lv4DYzVHAw$5ZE%s*T308pzXcdKlV-G%(tryj!pTt;Oh$CH6QorIo~`{my-Vb
z>w8%*?^O$zM(v$&!07$&Gu+0_7rmxmWU|r94!Y)FrNgA0yX=fk>iWyae@t_gXgTSU
z$-5+n_k`Qw)7=M4-ZCh!GMlkUiKYCw<te5u2aZ@7luU|V@46t`{;}T)enaNEyRYvS
zWco4)KMKg)@bCd+k)gh`N!-Q9vyM1->g-w{zh+O6+8phvB8?tD52f6Xd9>EqY3l6q
z4FP=XIhsF)wZD3@;o!D+GZ%f-YR*uqsSuiF@Z-|2p7gMDre7z>OwW9{rE;B$$=9i~
zK8G<WrLH@@#PXukOxgdAd_1p(KD^h=x-0zleu&1gO1?>UW#w{Gk5|21vhBv|w`I)&
z8cTwG_Wf%r41cjB^qot@q2H(X2fpbnZ+c=P75p*o5NGFqgC4hC!b%tRFyD{)onD-i
z(ieFt#b-~c$M3`c7rnjPv|?4VlqI{JMn;&+?GDqvE!F4V@vnM2&GFi!v*x!;esD4{
z{Qu9)fSPmVCQ5Dp%)r3l$Hc(E&A`DFUzD0ttgi<mM|13m&ZuzD_e{F&#q7w__MY{A
z;)z=G+gwVQZr<})Sg!P%xn<@X#&W(hOK$DkyI5gY!0otaU!I@#zc#yh_7&^pHf-w?
zBd;#FF+0Kies)Ff_HOoo$P>}Cn$mgQ&uz%e%fF^?GjYm7n;yf}iym!mb5yZYN;BTD
zIHvW)zK~rLWIQ>8`zM}}{@Sy8*S_Ol!kK5yTy`sxH#7ac!`9;wX7ShfV$G%MxYFL=
zINd3Ff;}<+re2a(j_0wW-Ym_pI#M|X=T+xUOj^Tb_8@c7)vqr9<GcG>bvPGtcAo!S
zQ#yw^DvrP8=Az2u>#gM#BXz`TtA9H;Z`RMSW5kw6_M504D`I3|IK;%jAi%)En3A6y
zP?TR#te;p=0Ik{>H5M>1!b5C?GUf#N`X4sn*{l3fe$~CmU5x@hacU+l79UjCHWyFX
zmFqIg#nWu<f4S+G)^$i*-aP;9^k?sc+MJRi2l*+ct4r)oUt;Oq(rR>7>+Vy2aq~=}
zMFy=Gu07DoHC&QiSI3rr&+1EHqDRM}me8o<FQ)adMkXsC>N(Y>we)Lm$CO`PH#da{
zy<dCBJVPlu<)-Q3WS8KC!iR2$wxq|%J*xNQ*0!#FXqzv~`D{Y+<1HO0;;k&DZg4o{
zExxcbYeKw`joJq_EuI>_HT(|_|57Tkt!0=URiUUHdHnhI18SOgESFCenP~I$yjPWU
zj^yO)@83?&lew{l^~6NoQ+vZVK9$^%$>o3Qi_x!!l*+)$hp*&wli8v*P4Y5w<G5r6
zZg4jA=FTzDRiEqZQZqmCc~SNKpP!6BNzZxnV`0+XQm0ANSSyQel?65EMmDjYlz-vJ
zU~|d4;_h9~&C3oYz4#gZ;!pMa{VMb0&l*|XZ{wA)J^3U2hd2YablR?1r(4R%!0?!f
zfk6;CohIiOrH&?4M6T=!^36YFz_Zu?R{e^i{K-sGYqGk(Df&6jGFj%kyY+~J+hjM-
z1s~o|-xjc9+u^*8_djp`JEx>-g6jtMWY?8OBJM1#3nEUoMy0=~HRM*;*wClt*B#{8
z)|YV6WdB~znSK$O53Wv;>Jd(0UcV&gP)VEFxtA=-r_B^r_S!gC7KBQ9&gDNVm3?Yk
z32%kN3E626G?w>0iZZw|SINq==$hxNxtdap=L|#?94)&aYaA0_$E^HzJ}Yn0(J6;y
zx88qg@@}hHy6%mQ{9z?WR;^l;b))Cu^Zt*||2$nAX)3bnTNmT0;B!LiF^rX6`u_hn
z?S5)*aDCbC1I%X_N|&CL?^<fUWu?{ftj*gC)_5IWtN+Gz-=yYLw~XihH%@&Mnim`J
z`q{r_+Khib-YEUid!?7>jq6|OPl=aTa{qPfZG5}n0CT?QC8-KGyEgyx-IAx+Rpzxk
zu~T`>+Bx~6hT@aeaVx^(qpzqv$=vc^yYF{bdgHa(|F)m0$C_4atfyNlF)}dhWM*KH
zVc=ja&o4^RhcrG?@{>z*Q}aqdDHhfc87*HK7#P+}_VsTufHa*i?G_1(?Y+54-1O$^
zbsz8O24}QPD>=shZ?EWPu2YNH^?cK2?K^+jb9dDKtD#?*<a#$ZEiF8xwBgj#V`5FV
zIr;v2ku55@dfMrZ8^k88-TZj*RoRe9hZJ>un*%*IYD9Qmi7D(`)e@ETMruQ%yLG%b
z*HZlo<*So*8uTZnoUFBPPHjGuF`4_B`A^O6ZMniH>eH>)_BLfNpO?MCr0D^N%fmDY
zqow){EVs)aJvjMe$Dv0JPPtRcUzi*{b@k8We~x=4;(C;~iauDp@RM71z?I|w@67t-
z!C%qS^HYBDixYbtOTLQKS*p)F#NC)*75*vDce%{D=Oq`;T?%`*YTohiY_CT-ex2br
z?*6Zuy7yUJ3%^~=H$H6T;ccBh;ZP<9hBP)%dC0&3PGiWaYf0$szFP(Ywd=3ccl_bw
z)Cmw_)ZQ4dg<pHE*Y6m~<DNQ6{i)JHzrM;oXN$g&rQanIf2{b~oS79yyZ<Y9KRD4V
zy5L#Lg?^6%qRlQVa{TK4{<$}M(gCYgLQxHmii;jiyQ9CaZsycNrZ);Zix{0Wq^Hk1
z>-la@-{A|J=6GlQ4l-aC3%Z`J;Or56up*+#^H8Em$EulT2X<_gc;C26dFztK$=5R$
z>^u~qDd9QyZSzSd&$Is)RClafDQq3cQSS66<I<r6b7XqDq@DKmO(^fob~<~ZYKFF`
z`J-#|R;w|G9=Y&=$+BV7&I{?1@~+OEmEJFZ%<5RIbE|l%T+g~krIYRlHBVirI_(ip
z1#7voq~}|eZY%zM`<}1**Kg4gCww{Nh;xa|J*S7MXQp*7aVj;C_dc6i^wF*6+3$N2
z7BBkx#O6C!hy3O{9e<|Wpj++g+1*pp?z0w@mTtGaI)`P_(f*lC&gHB|TI`Dt)K5%l
zJe$OR@?^i2MskkX$$ra&(N=Ok$Gk7)t*!9lZ~Z*&;jedzyVltBmKS%c2UOQ_rI?;^
z+W+)^#?kHjn&SKy7nTU`+E~%gR^={g_l<Q$I#2cexNUt>?INyCM{L4fRagDq+o+ma
zEmXDO+nXrWyPhUn&gi7SySc?^?>=rJtK0uvgFEk*TgZe87R~8jZn6GA={J|Xo0)rm
zFaFgb7}+;7Yr1s--;>JN4_^+=DxOvFCi{MyXbsl`y=gZOhMrcFX+D%>vRc!5=Dnp!
zjd$ykTc(F|9G%~qeBx7HP-I}b?#4+m)1|)Ol0WiD9;uw;Vg^+<JNqnrK{N~tF+kH`
zT7F(hNMceBxKc*#k9|4Kq!7!@z);J9lxB-lOG+~H(u+YQ;+C-6dABVDYU30BJ6`x~
zHA%)=nxn8QH|qVij_s~f>ljoonF-!ASh8kE{C!1DH{K%03yb{a&em4&wePDaR(CB+
zNZ1*+dXJUn=bn{*!RyLa|9O43s%&2RH9^<T3tekJDR2Hc`?UT49THim=Qx|sPUK3>
zzPa|(-r{9?T({rORQcv2z3jq{s|uZm{cng)I*^;6Hg{T^#l50~cQ?+M{xs5V*~Qg+
z*hBj@(<YTKSbNV(KIrw^Y2`V8JOw|@+GY4@?z^g!Q@BOB6PT(moO-u&qvq^CH*+i=
z*{NFYF*)Mc^~5~nsrS6cJDS<5<Uh{(R@1<E*Dza9LG#g$Nd=|zmqpolmDFERuejp6
zCiX{&MBG%zSytO01^yAva2Hu$$<3m4#)IL0>zh~n)=L|=FnOAWbpH{O{OX%MlXq+4
z1w{{!<D1TK=P$_du6mf~aoZyI!c&J0|L%Xjc;_wWgmZ=&JTD6#X|I*ZnD}SKzxdtS
zDKZZlkF-pzw=118>qKf#fJ>5lp!KKX#hRxtUSX~``LenF8MD)Li|82-Uph&>mn%3`
zU~*8ZTqLLWtoa}5-|iRt_m;{?%18X-mfdrOA$Z>>^*t-i-49>u*&)~e(p;3=V-2gh
zG~a@U&5JrduNTu6>X7q`Nx7o-u=|?c!3OQL$ho@rk8S>tH#zm+zb~);c%J@j`2PIq
zRLP_b{~|)K)p6Feo$l7C)G(IJEO1TXo|7WIUu}ZGro4yWbi+^HGm4q~#@+k!_LCN~
zU7M%6beI;|N*-dJVEXjhp5)sT!g4s?EZeh~-Dc&{>aQQ|rtdH2`Nw*9c0?{il9J2H
zn5_{#KX(*ek5SwE_O_a<=AKvfGpbzeWe=$9EPE`O)voWd;FoFO?iQBd&Fg#1U0qmB
z{)r|^C-KB(mSmP+)e^7SUOsjC^27RF>pg#Dlz+Zx#&7oL)3y(eR`!xr{Y~W^N>#mw
zmRS4ly6oA$+vu0B+FhfU(<Z;udHKOq`jzcHb%B_>k1lyEx}LvPpeo_Z{v|O_`$Iko
zyqPD_w&T~`_9fR!1A0zhNNb;U_u;gZDb_Vhc$ZJQ^K+@C{pW{^w=1~c-Oax5-m|%9
znRDGbl_lgBR0;H4c$ocvgZTc*)_o;8eVgWnz4gqADR?cFR-kRoYf%)@Qr~*!^8dEe
z?~(E@VpP*lxwqAtoq-|Oih+S2mUoLwDsxgHIdS&=;@K7g$M5Gq`^Vn){mRC<$FBR_
z^_#Wr`k5_zxA;AqmpUQYm$6X6-1Gmh=(Prl2?^#Gmd}vC)gQL|*Sa?z>+jz?KJj~S
z+RjTSFU{X)@y&K7i&e<Ux0m+s{8#a1`unGDDhoSGl&1e{KJ#~b{=Ywuzut|^lT(^m
zouF{?{G)`I`MsX!)h65AI2G|kzeG#8V#mcu9iEu9?UN%??8{yTE_fUwu{CU>5&z_y
zA3v^)pK($n>`U5Wrk`=_yCc-tZu=WaHoDGz5<T@<b#}Q(cDWVzWgm8<^9NsA6n#rt
zIO)=vFG~_N!W@?s?={pvFthXZ#Urzto?TA<BjUbKoIQQ|_Re!hteOt)o;~kJe!>$c
z**(*xH+K8v>2F@WW{1wY59i(I&bYJNTDWq-6;mEfi4M^lze?skIn}Nn^yHaM_@f7h
z*Vev%Fj>J|H2ZzXia(vI&-Nd9<}axG_hwH0^F!YroZRc7vi*0pk>#$)(CSTJPDf{N
z*!{(|Y>U<QV`?v+{mVak_e<yJg4S41SB8TOeK#aHJsB;NEGKTVw1{@tx1;;Lwe`BB
z^b$r#W`@<XT2J5Va9-!|Y)uT);ozM=*EGu;#Yk3c_%e~zJYR3U(NbfNOH2R1nwom(
z?9`sC>Y{Cz&P{)G&v#Sk!awEOlV``Y^?%=Ad*q3}4qMf9i5LG~GHq5kb(sHqqfhRw
zrk5@^IO@0ZDikWni1L+XESIZ(oOfkK#pnIDMX!=7CTyB|=v)E+@B5ri>&u%Rjb8^|
zW!!T5)$5uaos%D`#qpUpp0v)<5j$MXxXaE&NzYs>(vyGH#YXRauPn~Qmo8b8d-24g
zdt$vyUglmrG4Y;Q@01t07f&?Y6YFjGzAfX(_U*39bLQ$YH698*zsRzWEi)-zTiEmV
zzTk#s%uCm4nBJRu;z+LArM2nY*?VutChfenmUr!HjpgUXPftBrQgqu|DPZo~E4ACy
z>N_s4Jt4YgQdL`>S;33-Z&w}vZ5^Db|7xlCzBd0I29^<r)+q^|Slj0`v-0|<d#+P1
z{#*SmEN|+?+dCT*|E|f~ek?k7<CAZP{G!?R)~d|;dtU3f;fpyw%XQa%kyJnWVT-E%
z?cVx{3vV6$y-;zDo8R1d#)o2@6U2A7zc|vi!g6=j-VKZkc04%6dY^TR|BL<FdNsez
zj!pTuJLrqg@0NeN4}JCdJwyKP$5idJ)jux1nz-zGlWn<v$&$@y^=_SSjH`^;kY3Qa
zVCR8(N0!!3^3b_q{G+p`>bWDs{W}_!r}~aA?laPOQ((d6XSPfK&4Mc_`KLZ~$S#n|
zK9<0HIf|`Ach~)={;BoCkHwfnUKXvrX+8J3t}=sa$CkE#+b%Dw=e5ngXruMisByk`
z(aVb&pQp`qUn=_Z$tzjqV~4rqnN8L(895yGQe?g|N&ap2yiIDC6c2YRHy@q$bW>O2
zt%*}B4~9JxI-<>}eAV~RuhpsFwmo;+oF}C5hQai;TH~QUzqQk*mQFq(wKh*ELN|1=
zty`$?Q;VbPW`_ps6jG=#>8lWSc-&#m2xc5Ux?A38UWIT(POgUOR;_z10{agATovE6
z>X9$Q#;pfe#(tcAao@XzQ{U-tSMD%xJj53eyoXbytmM{_*#{1VTg(hdo*Lwy(5rlp
z<$}c{Uj|f0!N+&)J?i&Z#G<9WeAjlFGai}NE)*h~&%_b`FypY}$77NIZy%lQkjTD5
z^BxPAwUw!(uJ}q8F>43YFwJ``V*3txwl%e$JX@G0^HGhUyQAg20z&t2Dr_^85y_v^
zvp*K(&W2f!d>3q394ve#=sAZ@Swq&Epgo*AXQm%~<je3-`#fu*jg<QyPC2-<dBxw?
zov3)^o3L@~!Yfx+RR}xeRBAYeepG9?ajPI;PhsDmvX5#3HyE?J%s~O|8{qv>ZNm-5
zRU5rlDsMO@n9sx^T)d$|*x|=1V+V_Fi$sL!IK!f2EpOR;-8bT@`TldasW6t#KUf#e
zENPL=DI^+U@gp`bg{^%}%Kvv08V~Gk=SlLe3W%*){*}Lf+s5j9lN3^>M7euCp7xIY
z&eCAZ4uklTIVwB;JwNpO3Eu&eTZvw_cPp8fx34<h`k`Wytp&?p6MjvZo1WjTPp>t#
zP)IpnD(c?wq2rFK42M?V*Bi&a&EenE(Y~td_K*78O<eOEJ9uZWNZ?+>;OA1hpo{nY
zQ#qA8@8a0Ezv(*OXCrs=M)~F&vj2Cq6}`8;x%lSI{N(bc@-LF*UFCff+t=?{VP?(I
zzIlUpt!>MMwaVp3c<f@-6e~E|Sw8Mh)0^>dZ^5z$KU;ei=`-;!k(|42ztqXzLjUE*
zBN(%nUArYAyM6olf@X_vPbM&Wv}dK?Sz5Zt)_+p$o@tVEKD?6ny#7`gpH1gC6V<zM
z*QbRn==+j<|J(de=Kq<o_0u-Yy`iz2oq-|FmVrSEwVn;nEXi<9OG`~IfmXS>!M8zG
z?)@{h?EUwhJr+DUW%4%Q1+#H(znQ`JsEwkF4sNTdQz)4KFKp6U3F~d;$=^deli%it
z@BX!Z!KK~&=Mx{;d{hxNirR7cvGDbW6V+=4ZFTPa-hJM^ey888jVIC@dwv|MeYo=X
z!@u+E_uP(Ka?|@jFguUS%cExB#NMf$e$;(Z<*S)Z%F9BHJub_N=E_zZ%{lQks-kAo
z)K2@EbA9~3_!lbK&RgcNJL#gZ_uWG=H@2uH%Q0r_PM(nZa-)x&nxePd=N}wvj_U|(
z3wFF=&6z)EdfJmiXD7c{RPf@2)snUtiT1AgQ-Yq#UA8gCpQ5TNPb~HkR_EkDz%IE|
zTJZfLiz`wsJ2YdfUU5Cr<T>?y+s7|!4sKRVE-IM2L3c?~`i8{3^C>^l0wiWs2K>C6
zx;3w`>XN9>M8VWUzZwc3nr{B0lA^RJjc@j>S5FRANd(=G7n*te+S~}9&1J&2QPEkE
zLN?Zx*R|J6MR^%rh}d<Sw|nR7mU~CfI;#k8<2tbT?UyrwqA&Ncul%sI{Oold{!JmZ
zK6#TLC4cn%|2fhv@>Pz0#4FB1b|I!PCq}mYac(OM9!j6P^u|au=lyi)uh;J7?=e{&
z@mr(jp`VE1U;Z4sjR_CWr9GXj{q*4KbjEJEQ|UHVyW+!aBX@ke$n{1qW}1wy{?v_~
ze4GDl7s~siRq<m<Z0AgNZGMZKxrG9X0ft98TTZ>p;p<AX`*Z5*lT9|0SzV+VBChJx
ztT}6uA*^xrl7?B$>Ff8Gd_VCwl`UQ;=4?~@ck}I?6F5)2I8^&P%<$pEN$rnTf2dt2
z<?`{3-7Sv!|2IFpSbOU4-OuyqZ?zE@vz_TKxuf3hdZ@sXf6LSRk9(;dEIAU=R=<LA
zLE?lyE_drG-E+;KIrgrS{}-*aT&=xYQ~F$pck!10a}{Ju?IcWs1w)xHlzHx($7fz>
zcIU|KhiWAs`DS<jEYMF-Ha1aSW~LR|>7S)$8(ed>(xUcu;HE5>#a&hICSA+QT^74l
zxtn+`DtB4TRON0WbkBOx#QXOIdrQjtKFknXS0y_4)lCQ8H`WtbFQ`{f$+HT0)fM*m
z&ZZVy6`gw)t2SP{`%dU|>b_G2>#{6uV|1oRDqStxobI;3diB!k4QlmVk9oaW-<<mG
zz`XpQUvBLj{mzEB+5E41HTSvsR3&fP-g=kq%wuWEA2x;Wer!Iek@$!IrOsKs#n*Q>
zX8u)^Gn>i0+v#dib>ef~$@lDpkNq>B_{#Fdn`h5;7k-g+fBIq1QoY-~_0RX6I9!w^
zuzO<h;^!?U*9C4JeWz&^teLfL-93#ECIiJ8Wsdxgx7~l_&*Hwj@08)O`paE>TlzoT
zH(AI0_Vuq9=ky~p*F8M1#g|oi_GI>VFE*pd=Z-tss-7nL#T^uD<Zj&4v9@-K$C;Db
zKc?1LO`pfUe`n&WDRa8#_6cb`DUs-k%lQ#;!gJ-xds74iEgem-^&F5|wv|O7vZVf_
z`oe!4KDlgBOW($AF1MV&?F_>fg)4Hu()Bj~v;DHh_49@)vpN37Oe$h`x6X~MjED;K
zUv^)x$Y#w)L5IUu3XCReM1>r5zh2z;e50)(zvMj`PMu9UxAm5t=39HFc#~1Iz*^Rx
zE9Qc~*H628JC6J8zRm>`gs;4r&B0}9y)`cFw_1~F)P0u?QM=r#`&P~J;_SL(zFJ|o
zu!D<9UxhNL(q{rQpq0LIL`v?4Sy7_-EFSxveumy}TJ@-xVdB<<D`G#IFW&S%Q~U1z
zJADfGIJhhweLuPdoVb}OoX_ID-)Uuw)M+o-CQ}qMA9tAJFuX&DUn~S-$;Gz4kJg6&
z%@@ySX`QoRS%tF3lQ&mbuI2>^>J&9)>1f~Mm{!*s#KY<>Zmznb@QAeFMT<whphgA9
z1rP&TJNG6`+<Gu8D|C;b!X}%D05EO#Ml^q~ME!1%ku0W1r5zR1m#eJ^doBPsc-{1a
zk9rwihV!uoDoMHT5ftoIzDJy~ZguF!?t!|ul;ax6T`v#G=-lI&Q1_H!LY(UqlV0U}
z9UaF6^I0->RD5?nCY;aGy@od|cr_?=pB{vUvg6#%t)CaYc)arRG^GRcj5YjC7c_33
zrjpp`+OYq}EcJrb6TZ*qljP7*RuPCk#Vq~ET~l86&cSz=r!hG$KB6TMB>%GR;_?zL
zaY4W5vgVxc{_QK;H;db#w{X{zxcPe;Z#!Ro?ewc^(k=^@x-@;mn46uaCqF$U6(bPz
zeOH&Z!!N%VyfGYF>%Oiy_-Y>io)!9`qWxe0-xdq`A=y!tApA79;pxHN6?`Fu5AM!$
zymb8T>j%?<Ki-;ncWTe2vyV#t<ngE8t1flheEGf4T$cAABIk>|pQG%)KWWpg5&^yR
z%;~>uTBfXJDn80nx1+PBqagauIir(b=Fc=_y&wNk;m5fzty_GKy<N{%Gk0eGa`j!z
z*L>EdOgp)uwq_B3N4Ebr$%dj8R>^j&6mGp%Idxdnzw7a{wW=$v+V^#S+of_g^7^!(
z1LwY++x}_!m%sn9cC8KWI%}?AV_;Y!&A=cEtMW@SQgc)F!K5K*sOwFnfBtPVk^THP
z$`wlTlhw<l*h{-^Y@Ds@D}5xdT`se8+Kqrk4wpP@rEfiMU_87lv0d~=ZhK;Mqu7n5
zzfEph<;Y%Gzrj6d($R>xNoF^?*4n8E{e1lUznuKa)z{x0=aCgs)bzg<_qXV*v&zRx
zhshh>-j(~NzqZmtYYxkdk1bpFYEHbjwfFn|-Cfs1U6Ps-YL%`R-tL~_b$$A@CH#jY
z;+|=GCgh!Q?GfG6RiG*GgX6`qoXnkJ(Xm~IzBW@HNSwGQ)xXmJLcruNW<`%|+17VX
zsydLKw7^q1z4ag)dui{4;G3yTvPsvzHTWOsZ+o)ulfdDigD$sM&z!$~OJOPJ`t<G@
zQ5&{xS(jgTb61XI!G@fQu7VJQwcn-JYDcryGACroU4Q*`*WN0}^9&o_H!lmDW^~rA
ze(k-JnbRD;70xT$er=6uOiky%Q*+L&^;>>9Cv{WABDNHH>Ea(-Z)GJH-#jQi;i&oZ
z554Eow(oxRfcM0s@aG@?YOlT8)@{C^<KmM9zO}Nq-lt#TJ~3nSTdyBd2d-yH*0eQc
zXe{#HEb-Yzw`;4!{l|;=8F*77F0!o5^g4E<UE|%O3KflOJRhtME|s)?a(buYS#Hnx
z3n5$g@(1%s8ZQ3+>gV_M^Xy%Z^UssD|MBw2!^4l+?W3H|t;(I#@-$R`|KB&~kIOH%
zpI7(!(?#*C6_LmO|2nL`|IeRFQ-MqAYUjH3=l}Wk=4AJJyT5;TyNiDCZFEolG2xiY
zqS;S4-hP%4F%BzKRaY+DCwiFU@~5t|DpM>D&hlpcJFiRT%UR8|ndLH;R!6ozkY#+j
zn5&|;=)7QsKxgM7(K+RdZZN(pI%gFV5|!xwS!-sch3BEHXGuZF-X_;X9Npb5e0=|2
zPp(p9esA}Wc`h=B-zEG+_&l9NE*1Z2n85Y+UC^r=rPF^kdE14(&~ugb(e^c3qO3kE
zt$SN0Pu<M0mpuQ<ayD19ms+g8RMjoB%i{#s8Lbw1p=UEbTFujt37;tQXvS3DI_A|Y
zyiObp`H>pqul1qCL&AM;gGc1nXVVyJz5VVV4A`i3ci-t{8@oli<U%g)X|&W=-u}bU
z{7_+3tLz08tz`QN>;j7<-WG<k-hZ0c_M-d)v!B?m-+ZclyZzU*ZO?qsVZtq7Yb+?b
zZ;#&&;r-2;(#c6)ydtH`!;YK$+<oY*Mo8Z2hmW|-r<@Xxw_i8mcdqWu*&C`upPu>F
z>9m)le3MQ2PL3D1<L7SwpmI}rg23nKE4v^4S!YrG|L3DU8uRt{shnSBAd+#n`-a=?
zEq6a2T378ODZN5d*?dXn9@RItj`D0@t%@J19M#n6eGr&#WKdAB@IV@`P)w~$s7Xfw
zmv8mm%PvkcC-SL!?QA)`NOIkhwI471(5}9c^UAqnt&Gs>Sh*VyMQ!KIH{~*NTXjJu
zqu5P*)`7^GYhzA!H07;7yk=dimGSmnf@eis+oKn5GTUf5rTHZH)r1Wj)J#k{q7*lL
z(wg(rG~41$f*VtA`Rm1ZbdPb1zf_WZ*pOm(sp|ai6?t=}96vQjDT-g3=huUZuB%>m
zqGr93F0k&NeCmx9_l>9bs)hBR3fg=Y3b3Bpr=zK^?tLNfcDCvC*>ax>Z+$yf;MaEC
zf_1(3UH2rhL#dP3xA6WBo_sUq=b71SV+_qXUln{!d~3ISLoj=g;#IL}6M1=!r?xV9
zo2YC5FA_de704A5vUs!S#!YMoKdkp;EGvoiK9>Aw^`_0<%D2kCnQAYsS=1rna!^oe
zW9N<I59(xBE%0|xjVYhewIKdMo?rBBpPff7y?1@;opJDH;h9Su=}*I!XUsb(md6#O
z*j%aI$-XP}RNfTp{aR{I^*yE*N&G(_a`89k9lJFwx0vl>c)5E@mTj@s3puioYpLMZ
z-%L{M!PW{<59-91DXw_XbnB1!lYsx*Ez>h+Y-w{|a%;JwkjOV>QBJ3C3v_0_ux^d8
zy}et<bDE%$V5m^|vamOTmMS?bvLCFvciUFPamDPcNmCh{ttIQ<RZnwV9AIwww5C`q
za^v?~GC!7E%g;WuU`lkmuk+;F`vo>;@Tcuw6`Mcpd3AoIpsiu{W~s_BhRv74E}Xjb
z&(zi8d04jgwK>O5pN+Bj@MzC5g9sBB%{z*Uf7!p;Yt^LfFMGjTdgA-SZAw2X7Ir$`
za0rRt)XRQ}^%7s-g2GMBW;0AQvu~{_y%cuZ{5AKx67GHKJ!@61yZ4{Ze6z^1_@?E~
ziiyY0>Abmdt54U&wBlPz*P#m*_9rBc&p4m*R6nw&gkNSa>wO+IA5USQ4|BZ|UMyr+
z<5&2p>%O_<u|C$SO?-iv9}^=3!x}~g1`$}*R-T#^j5LYUbK#&Dv!MXngWV6Tb#`vQ
zIaO&>$##XHk4jCy7&6Psq|c-r^2u7`pMPtWLR{5pm6`oB>v#Tgh~LZHz4*e)w7At=
zO-m29t_)tU|N3U{DwUZA68?-<TiF9w|NE3Q^ES`ybdQyPrmWI%jM_5y)-sXkwMX+9
zd(UV(Y4j@_&I<9GXChU1`{+#79<`mlpY`5;4ULUoo!@poFhgc;tMw-9mI-}-XB`xu
z*pw4HMP7IQ;?zfqMv(%VzZck7X2kqRopscnV{WRD)a5-oi~WUPGKb!p{j0)hz1Ehu
zhebNh9-Gcmyg8t@vp=cmg<Ya=&FLwu5%TMLzfYMj_uhNq|CBrR@D)~!Ou7t+v3ArI
z#IPAN7>#8;aez0f0jLWZVH!d7E@lP>$jSonY65hPsH?j`8esSy8v_ISS}%0nsB3Ou
zIze<BCz5X13LJFpH7Ls<K-yr~hzB}Z2yr^{v^7?5pig$g%wb@d#s@VC*$j|F?3r=R
zQ>s99D>Q6rloEt$N13(cVo(69VPueCXh^9&As2Fm5l$~TAtE1gg%M;2avY=2ZGsH|
z6J9VokY_s4wWCkRfOUe2Ya&o57Q-iJ&^4ovF@g1g2|F1S&1fS|=;oj|T){?w2}2dA
sIp78?QZp9a2=oF1tQ$=5XhV%aD=q@OS=m4eco=vXbeS0#9CScD0Ln~HcmMzZ

literal 0
HcmV?d00001

diff --git a/tests/test_deep_cleaning.py b/tests/test_deep_cleaning.py
index 3d1c8e1..82579a3 100644
--- a/tests/test_deep_cleaning.py
+++ b/tests/test_deep_cleaning.py
@@ -105,3 +105,34 @@ class TestZipOrder(unittest.TestCase):
 
         os.remove('./tests/data/clean.odt')
         os.remove('./tests/data/clean.cleaned.odt')
+
+class TestRsidRemoval(unittest.TestCase):
+    def test_office(self):
+        """ Cleaning a docx must leave no 'w:rsid' in any of its xml files. """
+        shutil.copy('./tests/data/office_revision_session_ids.docx', './tests/data/clean.docx')
+        p = office.MSOfficeParser('./tests/data/clean.docx')
+
+        meta = p.get_meta()
+        self.assertIsNotNone(meta)
+
+        how_many_rsid = 0  # total 'w:rsid' occurrences across all xml members
+        with zipfile.ZipFile('./tests/data/clean.docx') as zin:
+            for item in zin.infolist():
+                if not item.filename.endswith('.xml'):
+                    continue
+                num = zin.read(item).decode('utf-8').lower().count('w:rsid')
+                how_many_rsid += num
+        self.assertEqual(how_many_rsid, 11)  # the fixture must contain rsids before cleaning
+
+        self.assertTrue(p.remove_all())
+
+        with zipfile.ZipFile('./tests/data/clean.cleaned.docx') as zin:
+            for item in zin.infolist():
+                if not item.filename.endswith('.xml'):
+                    continue
+                num = zin.read(item).decode('utf-8').lower().count('w:rsid')
+                self.assertEqual(num, 0)
+
+        os.remove('./tests/data/clean.docx')
+        os.remove('./tests/data/clean.cleaned.docx')
+
-- 
GitLab