From 174d4a0ac09c2e9d4a9aa3677a442c05459b8309 Mon Sep 17 00:00:00 2001 From: jvoisin <julien.voisin@dustri.org> Date: Thu, 20 Sep 2018 22:37:53 +0200 Subject: [PATCH] Implement rsid stripping for office files MS Office XML rsid is a "unique identifier used to track the editing session when the physical character representing this section mark was last formatted." See the following links for details: - https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx - https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/. --- libmat2/office.py | 61 ++++++++++++++++++-- tests/data/office_revision_session_ids.docx | Bin 0 -> 12163 bytes tests/test_deep_cleaning.py | 31 ++++++++++ 3 files changed, 87 insertions(+), 5 deletions(-) create mode 100644 tests/data/office_revision_session_ids.docx diff --git a/libmat2/office.py b/libmat2/office.py index 5c2c996..07bbbb9 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -8,6 +8,8 @@ import xml.etree.ElementTree as ET # type: ignore from .archive import ArchiveBasedAbstractParser +# pylint: disable=line-too-long + # Make pyflakes happy assert Set assert Pattern @@ -15,14 +17,12 @@ assert Pattern def _parse_xml(full_path: str): """ This function parses XML, with namespace support. """ - cpt = 0 namespace_map = dict() for _, (key, value) in ET.iterparse(full_path, ("start-ns", )): # The ns[0-9]+ namespaces are reserved for interal usage, so # we have to use an other nomenclature. 
- if re.match('^ns[0-9]+$', key): - key = 'mat%d' % cpt - cpt += 1 + if re.match('^ns[0-9]+$', key, re.I): #pragma: no cover + key = 'mat' + key[2:] namespace_map[key] = value ET.register_namespace(key, value) @@ -59,11 +59,56 @@ class MSOfficeParser(ArchiveBasedAbstractParser): 'word/fontTable.xml', 'word/settings.xml', 'word/styles.xml', + + # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx + 'word/stylesWithEffects.xml', } files_to_omit = set(map(re.compile, { # type: ignore + 'word/webSettings.xml', + 'word/theme', '^docProps/', })) + @staticmethod + def __remove_rsid(full_path: str) -> bool: + """ The method will remove "revision session ID". We're using '}rsid' + instead of proper parsing, since rsid can have multiple forms, like + `rsidRDefault`, `rsidR`, `rsids`, … + + We're removing rsid tags in two passes, because we can't modify + the xml while we're iterating on it. + + For more details, see + - https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx + - https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/ + """ + try: + tree, namespace = _parse_xml(full_path) + except ET.ParseError: + return False + + # rsid, tags or attributes, are always under the `w` namespace + if 'w' not in namespace.keys(): + return True + + parent_map = {c:p for p in tree.iter() for c in p} + + elements_to_remove = list() + for item in tree.iterfind('.//', namespace): + if '}rsid' in item.tag.strip().lower(): # rsid as tag + elements_to_remove.append(item) + continue + for key in list(item.attrib.keys()): # rsid as attribute + if '}rsid' in key.lower(): + del item.attrib[key] + + for element in elements_to_remove: + parent_map[element].remove(element) + + tree.write(full_path, xml_declaration=True) + + return True + @staticmethod def __remove_revisions(full_path: str) -> bool: """ In this function, we're changing the XML document in several @@ -112,7 +157,13 @@ 
class MSOfficeParser(ArchiveBasedAbstractParser): if full_path.endswith('/word/document.xml'): # this file contains the revisions - return self.__remove_revisions(full_path) + if self.__remove_revisions(full_path) is False: + return False + + if full_path.endswith('.xml'): + if self.__remove_rsid(full_path) is False: + return False + return True def get_meta(self) -> Dict[str, str]: diff --git a/tests/data/office_revision_session_ids.docx b/tests/data/office_revision_session_ids.docx new file mode 100644 index 0000000000000000000000000000000000000000..b40a3415ad56150781929feac25049fea41db8e7 GIT binary patch literal 12163 zcmWIWW@Zs#U}NB5U|>*W;8baEYGh<!=woGI5N6<DigwP=D@n~Oi4UnPNG*=ltH{k! zSj57lv4DYzVHAw$5ZE%s*T308pzXcdKlV-G%(tryj!pTt;Oh$CH6QorIo~`{my-Vb z>w8%*?^O$zM(v$&!07$&Gu+0_7rmxmWU|r94!Y)FrNgA0yX=fk>iWyae@t_gXgTSU z$-5+n_k`Qw)7=M4-ZCh!GMlkUiKYCw<te5u2aZ@7luU|V@46t`{;}T)enaNEyRYvS zWco4)KMKg)@bCd+k)gh`N!-Q9vyM1->g-w{zh+O6+8phvB8?tD52f6Xd9>EqY3l6q z4FP=XIhsF)wZD3@;o!D+GZ%f-YR*uqsSuiF@Z-|2p7gMDre7z>OwW9{rE;B$$=9i~ zK8G<WrLH@@#PXukOxgdAd_1p(KD^h=x-0zleu&1gO1?>UW#w{Gk5|21vhBv|w`I)& z8cTwG_Wf%r41cjB^qot@q2H(X2fpbnZ+c=P75p*o5NGFqgC4hC!b%tRFyD{)onD-i z(ieFt#b-~c$M3`c7rnjPv|?4VlqI{JMn;&+?GDqvE!F4V@vnM2&GFi!v*x!;esD4{ z{Qu9)fSPmVCQ5Dp%)r3l$Hc(E&A`DFUzD0ttgi<mM|13m&ZuzD_e{F&#q7w__MY{A z;)z=G+gwVQZr<})Sg!P%xn<@X#&W(hOK$DkyI5gY!0otaU!I@#zc#yh_7&^pHf-w? 
zBd;#FF+0Kies)Ff_HOoo$P>}Cn$mgQ&uz%e%fF^?GjYm7n;yf}iym!mb5yZYN;BTD zIHvW)zK~rLWIQ>8`zM}}{@Sy8*S_Ol!kK5yTy`sxH#7ac!`9;wX7ShfV$G%MxYFL= zINd3Ff;}<+re2a(j_0wW-Ym_pI#M|X=T+xUOj^Tb_8@c7)vqr9<GcG>bvPGtcAo!S zQ#yw^DvrP8=Az2u>#gM#BXz`TtA9H;Z`RMSW5kw6_M504D`I3|IK;%jAi%)En3A6y zP?TR#te;p=0Ik{>H5M>1!b5C?GUf#N`X4sn*{l3fe$~CmU5x@hacU+l79UjCHWyFX zmFqIg#nWu<f4S+G)^$i*-aP;9^k?sc+MJRi2l*+ct4r)oUt;Oq(rR>7>+Vy2aq~=} zMFy=Gu07DoHC&QiSI3rr&+1EHqDRM}me8o<FQ)adMkXsC>N(Y>we)Lm$CO`PH#da{ zy<dCBJVPlu<)-Q3WS8KC!iR2$wxq|%J*xNQ*0!#FXqzv~`D{Y+<1HO0;;k&DZg4o{ zExxcbYeKw`joJq_EuI>_HT(|_|57Tkt!0=URiUUHdHnhI18SOgESFCenP~I$yjPWU zj^yO)@83?&lew{l^~6NoQ+vZVK9$^%$>o3Qi_x!!l*+)$hp*&wli8v*P4Y5w<G5r6 zZg4jA=FTzDRiEqZQZqmCc~SNKpP!6BNzZxnV`0+XQm0ANSSyQel?65EMmDjYlz-vJ zU~|d4;_h9~&C3oYz4#gZ;!pMa{VMb0&l*|XZ{wA)J^3U2hd2YablR?1r(4R%!0?!f zfk6;CohIiOrH&?4M6T=!^36YFz_Zu?R{e^i{K-sGYqGk(Df&6jGFj%kyY+~J+hjM- z1s~o|-xjc9+u^*8_djp`JEx>-g6jtMWY?8OBJM1#3nEUoMy0=~HRM*;*wClt*B#{8 z)|YV6WdB~znSK$O53Wv;>Jd(0UcV&gP)VEFxtA=-r_B^r_S!gC7KBQ9&gDNVm3?Yk z32%kN3E626G?w>0iZZw|SINq==$hxNxtdap=L|#?94)&aYaA0_$E^HzJ}Yn0(J6;y zx88qg@@}hHy6%mQ{9z?WR;^l;b))Cu^Zt*||2$nAX)3bnTNmT0;B!LiF^rX6`u_hn z?S5)*aDCbC1I%X_N|&CL?^<fUWu?{ftj*gC)_5IWtN+Gz-=yYLw~XihH%@&Mnim`J z`q{r_+Khib-YEUid!?7>jq6|OPl=aTa{qPfZG5}n0CT?QC8-KGyEgyx-IAx+Rpzxk zu~T`>+Bx~6hT@aeaVx^(qpzqv$=vc^yYF{bdgHa(|F)m0$C_4atfyNlF)}dhWM*KH zVc=ja&o4^RhcrG?@{>z*Q}aqdDHhfc87*HK7#P+}_VsTufHa*i?G_1(?Y+54-1O$^ zbsz8O24}QPD>=shZ?EWPu2YNH^?cK2?K^+jb9dDKtD#?*<a#$ZEiF8xwBgj#V`5FV zIr;v2ku55@dfMrZ8^k88-TZj*RoRe9hZJ>un*%*IYD9Qmi7D(`)e@ETMruQ%yLG%b z*HZlo<*So*8uTZnoUFBPPHjGuF`4_B`A^O6ZMniH>eH>)_BLfNpO?MCr0D^N%fmDY zqow){EVs)aJvjMe$Dv0JPPtRcUzi*{b@k8We~x=4;(C;~iauDp@RM71z?I|w@67t- z!C%qS^HYBDixYbtOTLQKS*p)F#NC)*75*vDce%{D=Oq`;T?%`*YTohiY_CT-ex2br z?*6Zuy7yUJ3%^~=H$H6T;ccBh;ZP<9hBP)%dC0&3PGiWaYf0$szFP(Ywd=3ccl_bw z)Cmw_)ZQ4dg<pHE*Y6m~<DNQ6{i)JHzrM;oXN$g&rQanIf2{b~oS79yyZ<Y9KRD4V zy5L#Lg?^6%qRlQVa{TK4{<$}M(gCYgLQxHmii;jiyQ9CaZsycNrZ);Zix{0Wq^Hk1 
z>-la@-{A|J=6GlQ4l-aC3%Z`J;Or56up*+#^H8Em$EulT2X<_gc;C26dFztK$=5R$ z>^u~qDd9QyZSzSd&$Is)RClafDQq3cQSS66<I<r6b7XqDq@DKmO(^fob~<~ZYKFF` z`J-#|R;w|G9=Y&=$+BV7&I{?1@~+OEmEJFZ%<5RIbE|l%T+g~krIYRlHBVirI_(ip z1#7voq~}|eZY%zM`<}1**Kg4gCww{Nh;xa|J*S7MXQp*7aVj;C_dc6i^wF*6+3$N2 z7BBkx#O6C!hy3O{9e<|Wpj++g+1*pp?z0w@mTtGaI)`P_(f*lC&gHB|TI`Dt)K5%l zJe$OR@?^i2MskkX$$ra&(N=Ok$Gk7)t*!9lZ~Z*&;jedzyVltBmKS%c2UOQ_rI?;^ z+W+)^#?kHjn&SKy7nTU`+E~%gR^={g_l<Q$I#2cexNUt>?INyCM{L4fRagDq+o+ma zEmXDO+nXrWyPhUn&gi7SySc?^?>=rJtK0uvgFEk*TgZe87R~8jZn6GA={J|Xo0)rm zFaFgb7}+;7Yr1s--;>JN4_^+=DxOvFCi{MyXbsl`y=gZOhMrcFX+D%>vRc!5=Dnp! zjd$ykTc(F|9G%~qeBx7HP-I}b?#4+m)1|)Ol0WiD9;uw;Vg^+<JNqnrK{N~tF+kH` zT7F(hNMceBxKc*#k9|4Kq!7!@z);J9lxB-lOG+~H(u+YQ;+C-6dABVDYU30BJ6`x~ zHA%)=nxn8QH|qVij_s~f>ljoonF-!ASh8kE{C!1DH{K%03yb{a&em4&wePDaR(CB+ zNZ1*+dXJUn=bn{*!RyLa|9O43s%&2RH9^<T3tekJDR2Hc`?UT49THim=Qx|sPUK3> zzPa|(-r{9?T({rORQcv2z3jq{s|uZm{cng)I*^;6Hg{T^#l50~cQ?+M{xs5V*~Qg+ z*hBj@(<YTKSbNV(KIrw^Y2`V8JOw|@+GY4@?z^g!Q@BOB6PT(moO-u&qvq^CH*+i= z*{NFYF*)Mc^~5~nsrS6cJDS<5<Uh{(R@1<E*Dza9LG#g$Nd=|zmqpolmDFERuejp6 zCiX{&MBG%zSytO01^yAva2Hu$$<3m4#)IL0>zh~n)=L|=FnOAWbpH{O{OX%MlXq+4 z1w{{!<D1TK=P$_du6mf~aoZyI!c&J0|L%Xjc;_wWgmZ=&JTD6#X|I*ZnD}SKzxdtS zDKZZlkF-pzw=118>qKf#fJ>5lp!KKX#hRxtUSX~``LenF8MD)Li|82-Uph&>mn%3` zU~*8ZTqLLWtoa}5-|iRt_m;{?%18X-mfdrOA$Z>>^*t-i-49>u*&)~e(p;3=V-2gh zG~a@U&5JrduNTu6>X7q`Nx7o-u=|?c!3OQL$ho@rk8S>tH#zm+zb~);c%J@j`2PIq zRLP_b{~|)K)p6Feo$l7C)G(IJEO1TXo|7WIUu}ZGro4yWbi+^HGm4q~#@+k!_LCN~ zU7M%6beI;|N*-dJVEXjhp5)sT!g4s?EZeh~-Dc&{>aQQ|rtdH2`Nw*9c0?{il9J2H zn5_{#KX(*ek5SwE_O_a<=AKvfGpbzeWe=$9EPE`O)voWd;FoFO?iQBd&Fg#1U0qmB z{)r|^C-KB(mSmP+)e^7SUOsjC^27RF>pg#Dlz+Zx#&7oL)3y(eR`!xr{Y~W^N>#mw zmRS4ly6oA$+vu0B+FhfU(<Z;udHKOq`jzcHb%B_>k1lyEx}LvPpeo_Z{v|O_`$Iko zyqPD_w&T~`_9fR!1A0zhNNb;U_u;gZDb_Vhc$ZJQ^K+@C{pW{^w=1~c-Oax5-m|%9 znRDGbl_lgBR0;H4c$ocvgZTc*)_o;8eVgWnz4gqADR?cFR-kRoYf%)@Qr~*!^8dEe z?~(E@VpP*lxwqAtoq-|Oih+S2mUoLwDsxgHIdS&=;@K7g$M5Gq`^Vn){mRC<$FBR_ 
z^_#Wr`k5_zxA;AqmpUQYm$6X6-1Gmh=(Prl2?^#Gmd}vC)gQL|*Sa?z>+jz?KJj~S z+RjTSFU{X)@y&K7i&e<Ux0m+s{8#a1`unGDDhoSGl&1e{KJ#~b{=Ywuzut|^lT(^m zouF{?{G)`I`MsX!)h65AI2G|kzeG#8V#mcu9iEu9?UN%??8{yTE_fUwu{CU>5&z_y zA3v^)pK($n>`U5Wrk`=_yCc-tZu=WaHoDGz5<T@<b#}Q(cDWVzWgm8<^9NsA6n#rt zIO)=vFG~_N!W@?s?={pvFthXZ#Urzto?TA<BjUbKoIQQ|_Re!hteOt)o;~kJe!>$c z**(*xH+K8v>2F@WW{1wY59i(I&bYJNTDWq-6;mEfi4M^lze?skIn}Nn^yHaM_@f7h z*Vev%Fj>J|H2ZzXia(vI&-Nd9<}axG_hwH0^F!YroZRc7vi*0pk>#$)(CSTJPDf{N z*!{(|Y>U<QV`?v+{mVak_e<yJg4S41SB8TOeK#aHJsB;NEGKTVw1{@tx1;;Lwe`BB z^b$r#W`@<XT2J5Va9-!|Y)uT);ozM=*EGu;#Yk3c_%e~zJYR3U(NbfNOH2R1nwom( z?9`sC>Y{Cz&P{)G&v#Sk!awEOlV``Y^?%=Ad*q3}4qMf9i5LG~GHq5kb(sHqqfhRw zrk5@^IO@0ZDikWni1L+XESIZ(oOfkK#pnIDMX!=7CTyB|=v)E+@B5ri>&u%Rjb8^| zW!!T5)$5uaos%D`#qpUpp0v)<5j$MXxXaE&NzYs>(vyGH#YXRauPn~Qmo8b8d-24g zdt$vyUglmrG4Y;Q@01t07f&?Y6YFjGzAfX(_U*39bLQ$YH698*zsRzWEi)-zTiEmV zzTk#s%uCm4nBJRu;z+LArM2nY*?VutChfenmUr!HjpgUXPftBrQgqu|DPZo~E4ACy z>N_s4Jt4YgQdL`>S;33-Z&w}vZ5^Db|7xlCzBd0I29^<r)+q^|Slj0`v-0|<d#+P1 z{#*SmEN|+?+dCT*|E|f~ek?k7<CAZP{G!?R)~d|;dtU3f;fpyw%XQa%kyJnWVT-E% z?cVx{3vV6$y-;zDo8R1d#)o2@6U2A7zc|vi!g6=j-VKZkc04%6dY^TR|BL<FdNsez zj!pTuJLrqg@0NeN4}JCdJwyKP$5idJ)jux1nz-zGlWn<v$&$@y^=_SSjH`^;kY3Qa zVCR8(N0!!3^3b_q{G+p`>bWDs{W}_!r}~aA?laPOQ((d6XSPfK&4Mc_`KLZ~$S#n| zK9<0HIf|`Ach~)={;BoCkHwfnUKXvrX+8J3t}=sa$CkE#+b%Dw=e5ngXruMisByk` z(aVb&pQp`qUn=_Z$tzjqV~4rqnN8L(895yGQe?g|N&ap2yiIDC6c2YRHy@q$bW>O2 zt%*}B4~9JxI-<>}eAV~RuhpsFwmo;+oF}C5hQai;TH~QUzqQk*mQFq(wKh*ELN|1= zty`$?Q;VbPW`_ps6jG=#>8lWSc-&#m2xc5Ux?A38UWIT(POgUOR;_z10{agATovE6 z>X9$Q#;pfe#(tcAao@XzQ{U-tSMD%xJj53eyoXbytmM{_*#{1VTg(hdo*Lwy(5rlp z<$}c{Uj|f0!N+&)J?i&Z#G<9WeAjlFGai}NE)*h~&%_b`FypY}$77NIZy%lQkjTD5 z^BxPAwUw!(uJ}q8F>43YFwJ``V*3txwl%e$JX@G0^HGhUyQAg20z&t2Dr_^85y_v^ zvp*K(&W2f!d>3q394ve#=sAZ@Swq&Epgo*AXQm%~<je3-`#fu*jg<QyPC2-<dBxw? 
zov3)^o3L@~!Yfx+RR}xeRBAYeepG9?ajPI;PhsDmvX5#3HyE?J%s~O|8{qv>ZNm-5 zRU5rlDsMO@n9sx^T)d$|*x|=1V+V_Fi$sL!IK!f2EpOR;-8bT@`TldasW6t#KUf#e zENPL=DI^+U@gp`bg{^%}%Kvv08V~Gk=SlLe3W%*){*}Lf+s5j9lN3^>M7euCp7xIY z&eCAZ4uklTIVwB;JwNpO3Eu&eTZvw_cPp8fx34<h`k`Wytp&?p6MjvZo1WjTPp>t# zP)IpnD(c?wq2rFK42M?V*Bi&a&EenE(Y~td_K*78O<eOEJ9uZWNZ?+>;OA1hpo{nY zQ#qA8@8a0Ezv(*OXCrs=M)~F&vj2Cq6}`8;x%lSI{N(bc@-LF*UFCff+t=?{VP?(I zzIlUpt!>MMwaVp3c<f@-6e~E|Sw8Mh)0^>dZ^5z$KU;ei=`-;!k(|42ztqXzLjUE* zBN(%nUArYAyM6olf@X_vPbM&Wv}dK?Sz5Zt)_+p$o@tVEKD?6ny#7`gpH1gC6V<zM z*QbRn==+j<|J(de=Kq<o_0u-Yy`iz2oq-|FmVrSEwVn;nEXi<9OG`~IfmXS>!M8zG z?)@{h?EUwhJr+DUW%4%Q1+#H(znQ`JsEwkF4sNTdQz)4KFKp6U3F~d;$=^deli%it z@BX!Z!KK~&=Mx{;d{hxNirR7cvGDbW6V+=4ZFTPa-hJM^ey888jVIC@dwv|MeYo=X z!@u+E_uP(Ka?|@jFguUS%cExB#NMf$e$;(Z<*S)Z%F9BHJub_N=E_zZ%{lQks-kAo z)K2@EbA9~3_!lbK&RgcNJL#gZ_uWG=H@2uH%Q0r_PM(nZa-)x&nxePd=N}wvj_U|( z3wFF=&6z)EdfJmiXD7c{RPf@2)snUtiT1AgQ-Yq#UA8gCpQ5TNPb~HkR_EkDz%IE| zTJZfLiz`wsJ2YdfUU5Cr<T>?y+s7|!4sKRVE-IM2L3c?~`i8{3^C>^l0wiWs2K>C6 zx;3w`>XN9>M8VWUzZwc3nr{B0lA^RJjc@j>S5FRANd(=G7n*te+S~}9&1J&2QPEkE zLN?Zx*R|J6MR^%rh}d<Sw|nR7mU~CfI;#k8<2tbT?UyrwqA&Ncul%sI{Oold{!JmZ zK6#TLC4cn%|2fhv@>Pz0#4FB1b|I!PCq}mYac(OM9!j6P^u|au=lyi)uh;J7?=e{& z@mr(jp`VE1U;Z4sjR_CWr9GXj{q*4KbjEJEQ|UHVyW+!aBX@ke$n{1qW}1wy{?v_~ ze4GDl7s~siRq<m<Z0AgNZGMZKxrG9X0ft98TTZ>p;p<AX`*Z5*lT9|0SzV+VBChJx ztT}6uA*^xrl7?B$>Ff8Gd_VCwl`UQ;=4?~@ck}I?6F5)2I8^&P%<$pEN$rnTf2dt2 z<?`{3-7Sv!|2IFpSbOU4-OuyqZ?zE@vz_TKxuf3hdZ@sXf6LSRk9(;dEIAU=R=<LA zLE?lyE_drG-E+;KIrgrS{}-*aT&=xYQ~F$pck!10a}{Ju?IcWs1w)xHlzHx($7fz> zcIU|KhiWAs`DS<jEYMF-Ha1aSW~LR|>7S)$8(ed>(xUcu;HE5>#a&hICSA+QT^74l zxtn+`DtB4TRON0WbkBOx#QXOIdrQjtKFknXS0y_4)lCQ8H`WtbFQ`{f$+HT0)fM*m z&ZZVy6`gw)t2SP{`%dU|>b_G2>#{6uV|1oRDqStxobI;3diB!k4QlmVk9oaW-<<mG zz`XpQUvBLj{mzEB+5E41HTSvsR3&fP-g=kq%wuWEA2x;Wer!Iek@$!IrOsKs#n*Q> zX8u)^Gn>i0+v#dib>ef~$@lDpkNq>B_{#Fdn`h5;7k-g+fBIq1QoY-~_0RX6I9!w^ zuzO<h;^!?U*9C4JeWz&^teLfL-93#ECIiJ8Wsdxgx7~l_&*Hwj@08)O`paE>TlzoT 
zH(AI0_Vuq9=ky~p*F8M1#g|oi_GI>VFE*pd=Z-tss-7nL#T^uD<Zj&4v9@-K$C;Db zKc?1LO`pfUe`n&WDRa8#_6cb`DUs-k%lQ#;!gJ-xds74iEgem-^&F5|wv|O7vZVf_ z`oe!4KDlgBOW($AF1MV&?F_>fg)4Hu()Bj~v;DHh_49@)vpN37Oe$h`x6X~MjED;K zUv^)x$Y#w)L5IUu3XCReM1>r5zh2z;e50)(zvMj`PMu9UxAm5t=39HFc#~1Iz*^Rx zE9Qc~*H628JC6J8zRm>`gs;4r&B0}9y)`cFw_1~F)P0u?QM=r#`&P~J;_SL(zFJ|o zu!D<9UxhNL(q{rQpq0LIL`v?4Sy7_-EFSxveumy}TJ@-xVdB<<D`G#IFW&S%Q~U1z zJADfGIJhhweLuPdoVb}OoX_ID-)Uuw)M+o-CQ}qMA9tAJFuX&DUn~S-$;Gz4kJg6& z%@@ySX`QoRS%tF3lQ&mbuI2>^>J&9)>1f~Mm{!*s#KY<>Zmznb@QAeFMT<whphgA9 z1rP&TJNG6`+<Gu8D|C;b!X}%D05EO#Ml^q~ME!1%ku0W1r5zR1m#eJ^doBPsc-{1a zk9rwihV!uoDoMHT5ftoIzDJy~ZguF!?t!|ul;ax6T`v#G=-lI&Q1_H!LY(UqlV0U} z9UaF6^I0->RD5?nCY;aGy@od|cr_?=pB{vUvg6#%t)CaYc)arRG^GRcj5YjC7c_33 zrjpp`+OYq}EcJrb6TZ*qljP7*RuPCk#Vq~ET~l86&cSz=r!hG$KB6TMB>%GR;_?zL zaY4W5vgVxc{_QK;H;db#w{X{zxcPe;Z#!Ro?ewc^(k=^@x-@;mn46uaCqF$U6(bPz zeOH&Z!!N%VyfGYF>%Oiy_-Y>io)!9`qWxe0-xdq`A=y!tApA79;pxHN6?`Fu5AM!$ zymb8T>j%?<Ki-;ncWTe2vyV#t<ngE8t1flheEGf4T$cAABIk>|pQG%)KWWpg5&^yR z%;~>uTBfXJDn80nx1+PBqagauIir(b=Fc=_y&wNk;m5fzty_GKy<N{%Gk0eGa`j!z z*L>EdOgp)uwq_B3N4Ebr$%dj8R>^j&6mGp%Idxdnzw7a{wW=$v+V^#S+of_g^7^!( z1LwY++x}_!m%sn9cC8KWI%}?AV_;Y!&A=cEtMW@SQgc)F!K5K*sOwFnfBtPVk^THP z$`wlTlhw<l*h{-^Y@Ds@D}5xdT`se8+Kqrk4wpP@rEfiMU_87lv0d~=ZhK;Mqu7n5 zzfEph<;Y%Gzrj6d($R>xNoF^?*4n8E{e1lUznuKa)z{x0=aCgs)bzg<_qXV*v&zRx zhshh>-j(~NzqZmtYYxkdk1bpFYEHbjwfFn|-Cfs1U6Ps-YL%`R-tL~_b$$A@CH#jY z;+|=GCgh!Q?GfG6RiG*GgX6`qoXnkJ(Xm~IzBW@HNSwGQ)xXmJLcruNW<`%|+17VX zsydLKw7^q1z4ag)dui{4;G3yTvPsvzHTWOsZ+o)ulfdDigD$sM&z!$~OJOPJ`t<G@ zQ5&{xS(jgTb61XI!G@fQu7VJQwcn-JYDcryGACroU4Q*`*WN0}^9&o_H!lmDW^~rA ze(k-JnbRD;70xT$er=6uOiky%Q*+L&^;>>9Cv{WABDNHH>Ea(-Z)GJH-#jQi;i&oZ z554Eow(oxRfcM0s@aG@?YOlT8)@{C^<KmM9zO}Nq-lt#TJ~3nSTdyBd2d-yH*0eQc zXe{#HEb-Yzw`;4!{l|;=8F*77F0!o5^g4E<UE|%O3KflOJRhtME|s)?a(buYS#Hnx z3n5$g@(1%s8ZQ3+>gV_M^Xy%Z^UssD|MBw2!^4l+?W3H|t;(I#@-$R`|KB&~kIOH% zpI7(!(?#*C6_LmO|2nL`|IeRFQ-MqAYUjH3=l}Wk=4AJJyT5;TyNiDCZFEolG2xiY 
zqS;S4-hP%4F%BzKRaY+DCwiFU@~5t|DpM>D&hlpcJFiRT%UR8|ndLH;R!6ozkY#+j zn5&|;=)7QsKxgM7(K+RdZZN(pI%gFV5|!xwS!-sch3BEHXGuZF-X_;X9Npb5e0=|2 zPp(p9esA}Wc`h=B-zEG+_&l9NE*1Z2n85Y+UC^r=rPF^kdE14(&~ugb(e^c3qO3kE zt$SN0Pu<M0mpuQ<ayD19ms+g8RMjoB%i{#s8Lbw1p=UEbTFujt37;tQXvS3DI_A|Y zyiObp`H>pqul1qCL&AM;gGc1nXVVyJz5VVV4A`i3ci-t{8@oli<U%g)X|&W=-u}bU z{7_+3tLz08tz`QN>;j7<-WG<k-hZ0c_M-d)v!B?m-+ZclyZzU*ZO?qsVZtq7Yb+?b zZ;#&&;r-2;(#c6)ydtH`!;YK$+<oY*Mo8Z2hmW|-r<@Xxw_i8mcdqWu*&C`upPu>F z>9m)le3MQ2PL3D1<L7SwpmI}rg23nKE4v^4S!YrG|L3DU8uRt{shnSBAd+#n`-a=? zEq6a2T378ODZN5d*?dXn9@RItj`D0@t%@J19M#n6eGr&#WKdAB@IV@`P)w~$s7Xfw zmv8mm%PvkcC-SL!?QA)`NOIkhwI471(5}9c^UAqnt&Gs>Sh*VyMQ!KIH{~*NTXjJu zqu5P*)`7^GYhzA!H07;7yk=dimGSmnf@eis+oKn5GTUf5rTHZH)r1Wj)J#k{q7*lL z(wg(rG~41$f*VtA`Rm1ZbdPb1zf_WZ*pOm(sp|ai6?t=}96vQjDT-g3=huUZuB%>m zqGr93F0k&NeCmx9_l>9bs)hBR3fg=Y3b3Bpr=zK^?tLNfcDCvC*>ax>Z+$yf;MaEC zf_1(3UH2rhL#dP3xA6WBo_sUq=b71SV+_qXUln{!d~3ISLoj=g;#IL}6M1=!r?xV9 zo2YC5FA_de704A5vUs!S#!YMoKdkp;EGvoiK9>Aw^`_0<%D2kCnQAYsS=1rna!^oe zW9N<I59(xBE%0|xjVYhewIKdMo?rBBpPff7y?1@;opJDH;h9Su=}*I!XUsb(md6#O z*j%aI$-XP}RNfTp{aR{I^*yE*N&G(_a`89k9lJFwx0vl>c)5E@mTj@s3puioYpLMZ z-%L{M!PW{<59-91DXw_XbnB1!lYsx*Ez>h+Y-w{|a%;JwkjOV>QBJ3C3v_0_ux^d8 zy}et<bDE%$V5m^|vamOTmMS?bvLCFvciUFPamDPcNmCh{ttIQ<RZnwV9AIwww5C`q za^v?~GC!7E%g;WuU`lkmuk+;F`vo>;@Tcuw6`Mcpd3AoIpsiu{W~s_BhRv74E}Xjb z&(zi8d04jgwK>O5pN+Bj@MzC5g9sBB%{z*Uf7!p;Yt^LfFMGjTdgA-SZAw2X7Ir$` za0rRt)XRQ}^%7s-g2GMBW;0AQvu~{_y%cuZ{5AKx67GHKJ!@61yZ4{Ze6z^1_@?E~ ziiyY0>Abmdt54U&wBlPz*P#m*_9rBc&p4m*R6nw&gkNSa>wO+IA5USQ4|BZ|UMyr+ z<5&2p>%O_<u|C$SO?-iv9}^=3!x}~g1`$}*R-T#^j5LYUbK#&Dv!MXngWV6Tb#`vQ zIaO&>$##XHk4jCy7&6Psq|c-r^2u7`pMPtWLR{5pm6`oB>v#Tgh~LZHz4*e)w7At= zO-m29t_)tU|N3U{DwUZA68?-<TiF9w|NE3Q^ES`ybdQyPrmWI%jM_5y)-sXkwMX+9 zd(UV(Y4j@_&I<9GXChU1`{+#79<`mlpY`5;4ULUoo!@poFhgc;tMw-9mI-}-XB`xu z*pw4HMP7IQ;?zfqMv(%VzZck7X2kqRopscnV{WRD)a5-oi~WUPGKb!p{j0)hz1Ehu zhebNh9-Gcmyg8t@vp=cmg<Ya=&FLwu5%TMLzfYMj_uhNq|CBrR@D)~!Ou7t+v3ArI 
z#IPAN7>#8;aez0f0jLWZVH!d7E@lP>$jSonY65hPsH?j`8esSy8v_ISS}%0nsB3Ou zIze<BCz5X13LJFpH7Ls<K-yr~hzB}Z2yr^{v^7?5pig$g%wb@d#s@VC*$j|F?3r=R zQ>s99D>Q6rloEt$N13(cVo(69VPueCXh^9&As2Fm5l$~TAtE1gg%M;2avY=2ZGsH| z6J9VokY_s4wWCkRfOUe2Ya&o57Q-iJ&^4ovF@g1g2|F1S&1fS|=;oj|T){?w2}2dA sIp78?QZp9a2=oF1tQ$=5XhV%aD=q@OS=m4eco=vXbeS0#9CScD0Ln~HcmMzZ literal 0 HcmV?d00001 diff --git a/tests/test_deep_cleaning.py b/tests/test_deep_cleaning.py index 3d1c8e1..82579a3 100644 --- a/tests/test_deep_cleaning.py +++ b/tests/test_deep_cleaning.py @@ -105,3 +105,34 @@ class TestZipOrder(unittest.TestCase): os.remove('./tests/data/clean.odt') os.remove('./tests/data/clean.cleaned.odt') + +class TestRsidRemoval(unittest.TestCase): + def test_office(self): + shutil.copy('./tests/data/office_revision_session_ids.docx', './tests/data/clean.docx') + p = office.MSOfficeParser('./tests/data/clean.docx') + + meta = p.get_meta() + self.assertIsNotNone(meta) + + how_many_rsid = False + with zipfile.ZipFile('./tests/data/clean.docx') as zin: + for item in zin.infolist(): + if not item.filename.endswith('.xml'): + continue + num = zin.read(item).decode('utf-8').lower().count('w:rsid') + how_many_rsid += num + self.assertEqual(how_many_rsid, 11) + + ret = p.remove_all() + self.assertTrue(ret) + + with zipfile.ZipFile('./tests/data/clean.cleaned.docx') as zin: + for item in zin.infolist(): + if not item.filename.endswith('.xml'): + continue + num = zin.read(item).decode('utf-8').lower().count('w:rsid') + self.assertEqual(num, 0) + + os.remove('./tests/data/clean.docx') + os.remove('./tests/data/clean.cleaned.docx') + -- GitLab