From e342671eadd3f5ff922fe62cae81792d4cd65e83 Mon Sep 17 00:00:00 2001 From: jvoisin <julien.voisin@dustri.org> Date: Sun, 30 Sep 2018 19:52:35 +0200 Subject: [PATCH] Remove dangling references in MS Office's [Content_Types].xml --- libmat2/office.py | 34 ++++++++++++++++++++++++ tests/data/malformed_content_types.docx | Bin 0 -> 4131 bytes tests/test_corrupted_files.py | 8 ++++++ 3 files changed, 42 insertions(+) create mode 100644 tests/data/malformed_content_types.docx diff --git a/libmat2/office.py b/libmat2/office.py index bad352b..b220092 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -151,10 +151,44 @@ class MSOfficeParser(ArchiveBasedAbstractParser): return True + def __remove_content_type_members(self, full_path: str) -> bool: + """ The method will remove the dangling references + from the [Content_Types].xml file, since MS Office doesn't like them + """ + try: + tree, namespace = _parse_xml(full_path) + except ET.ParseError: # pragma: no cover + return False + + if len(namespace.items()) != 1: + return False # there should be only one namespace for Types + + removed_fnames = set() + with zipfile.ZipFile(self.filename) as zin: + for fname in [item.filename for item in zin.infolist()]: + if any(map(lambda r: r.search(fname), self.files_to_omit)): + removed_fnames.add(fname) + + root = tree.getroot() + for item in root.findall('{%s}Override' % namespace['']): + name = item.attrib['PartName'][1:] # remove the leading '/' + if name in removed_fnames: + root.remove(item) + + tree.write(full_path, xml_declaration=True) + + return True + def _specific_cleanup(self, full_path: str) -> bool: if os.stat(full_path).st_size == 0: # Don't process empty files return True + if full_path.endswith('/[Content_Types].xml'): + # this file contains references to files that we might + # remove, and MS Office doesn't like dangling references + if self.__remove_content_type_members(full_path) is False: + return False + if full_path.endswith('/word/document.xml'): # 
this file contains the revisions if self.__remove_revisions(full_path) is False: diff --git a/tests/data/malformed_content_types.docx b/tests/data/malformed_content_types.docx new file mode 100644 index 0000000000000000000000000000000000000000..43ac7437618f8f49e52c2006526efa087cb0c011 GIT binary patch literal 4131 zcmWIWW@Zs#W?<l8cr(Y&cfHw~Cti#U4BV^?48jZy4AIW{c_pcNCGjDZ1*yfcdKI}j zTLVw~H9LsJo(`XIQ)<SmL#&Nu3$x!I+AG=aZ9TCv<(<gw{onfngtq2NMTa?@+xgse z=Z#lpZx1Zqv@EIoXV22^%AkGkWD>k)9PC=_T)_HV<N5;DXV)4Pmu!sdc9>sxbnXQC zB-TH5X4MZa&0<hXQOw#9w`7Cp?TTO?shMGyqPO^-SaP*BS4uUp;*I_v%VU2}PM#|5 zJ8kd&_Ar)J&Dl@g{=b`2zdYK^Kkt;d)S=Z<myYjqidmR+XlK~Te9wC`e}(ttyqNcY z!(LS;@9Bry(-(R+?-%T`nKa>Jef8JU*9-IbqIa!Xcy5mRB7UxaQo@y<8by=NDWA~K z(^}iUSx5P!*SzJ||8ChQrzUq?@!ZtseR?6$OpA=PC%JPp-s^jEbFtyc7DXv<r$>8R z4{N$rJd#je)N0he_>HEp`oRs^4^P;?+-T@9X^KtNBOA|s4B((=;Nak3$my~3Wk3Sl z3=9nMMX5Q(`g$O8?Id4sW=Ed3_kTs5-`&m8?l^Joo03z$(pUDDnQz`*ILB_Bx%U4( zN3LA~T?>xysr>vw=2Gste+&HPbay)*UX|%sXmD|tK~}|WTYdS2%lf;*?Iv9Dni-?D z>R!vDL?O@qj-x(rR+<<Vu9&Tu<{xz2CP``Y@`;6aw|$>cZ?s^(;nizdC!%*VZT;=> z$K0$+Y?DfhT3Xz)nDv`2^hEtCQ!XYfu3TC8Gs|QuPep9T-|B@HJm)?Ho4B8MTvz!q z(jsEp_Nf|6qut8?XPV^KoZfRh=VIKW?z4*y-<19`Kfs%v<HZF=<(mu)47N-RctS*g zfq@|<KRKW%zo1w@v7i8y66a3z^=38{XnX&!OZCew->6j0NxmPM{aLx!U5oBKSz^BF zr^3u>-|kCCM~e5~yO)0N;oOrg`u*GV`2;3^T&K2q(nO{?pKk5grndg<<r2rPmP?s8 ztJzI09hJNbk6ypX-jSgoYI$(!tdfq0(=IKs+F&&GX4%b~KQmO*dUU_;o*Ve##npRF z&Q|Gb78SS@NPRKkX?!xNqWZx9wygF0zI*4N)qG-eP<q3{=^Ra$<8M|>sZG}5n0A`^ z!)}49#VXIbJEs2Z_)sI~xO&pmP|NvidTz^af8U)IeOF|0TW;MUb&VxGk!CyZ&yMqd zKBw>TkEm@Ej`UpaejqLXb*6rkeTk7*>reY5rpuM3ETx<8dQaf`_oe4mrQej?DGPr~ zngz5yyzqz{6t*^TNBra%85sUC;tN|r<giW7FG>XkZf{Vm{~-gB+V9~ywfA=!8s6A2 zam7PVAz=ln`&reJSGQ*0I`rWF?cAINpB~({{XS>j@2fB0D4b5<Ki1`ys?nJw(vT!7 zzUst<DpP)SjSYQTey6=0+xikN>g=Cs<QtLt;An_ck8lEW{t}%;E9dCvCI@e9()Gwv zULP>;PKaC3^M#rlmc8i~Yj7{DX;-y4wQ}(eL!Pe>I{c>1;hk2Me(G9-F^{T8z_~{% zULPV~AMmLD>UcUvV`+gqZ}?JM%N6`#Jco9zTWpoPbl-oL{krp}zg@8~;Or`i1vOP! 
zEe6b282R@wzC88yO!luITV^e;5mBFPu#f+8#nd~Kzf9}7{9W_k*L{X>9-Vr^Tg_(o zEMcbLt)J3mPx@PqtA-vur*Fe9x^DW~6c5woIx9=L>%a6e{44k(7hY8B{%OkZ{~pOk zQt>@kq_<4vedGH1`==$kRtJ`4@BaOR<=L)@Y!8yvV>n}X@&EsG{Esv!g0`&|YE5Ef zV7SeMFM?zk7#Pa)i&FF<WkO1Ra%paAUI{3gAmzhEU#>$20<GWYDV@2uEL33PvOJdp z?XL`mbGIEd<2<QRRn6QLrSa&{(?7NE4fW&WfA_^+V9)Js(D7_s7`W}yGQk3k?Uz;8 zbbIbQ?iUopxwfqFsI)N47MHxEl>xfnj8v{I?3{e-S6q-z?cup0>NgeUFMZkc+n-H@ zS?zWx)1P&x8m5}J@86Ico82nC`kbwl)Wipkd9rODQ}#IueONL11f!jv{qN;9xx7<9 z-FSHW;OehukNjD0@33gb)92#z<}PpbJjeb}tQ-^^KlKd`9A#i&5XKuEp!5n34&;DX z8seLO+kj{9`*4X}>f5d_I?-dbmqofPLQvr*e?ZDjF||%6*W+jF{x12kI=fLZLVwvx z_kHe{7Rk?#>+w4$w3jJ*Q_b9sE>VnAH~F5dliR<}Oy}c~+@q!uk(;`|yvh9ia-+VM zz}$(`CLKtqWmJ8AK`d^|rGouNUJ4Z}GF1<V1YGwzadCqov-dTjol3sPmzY{)GKVos z`5io}CL2_?IP90@X}O>&vw~M#-+t<6=i5K}e>YzYj;Zg?;*YRoQQXVlY*2N<H0I_^ zwexuy7N5>Yz7u=ORr%lR(|e~f<{bvV%BK`dz79BMu~*ys!RgKBHSN=^cWyA1PyXF@ zVQ1&rj_6&cGWDf;erH<stAe5~KE3rLD5UQH##Q&uz4`y+(O<(wKkh|1)LlvxWC|2o zoqT%(xB3dbi@RPY_lFlR=$_^oKWC@j&Ehvvn@yA?m@gge<NU$*n){a2O}F`Nmlf@l z;<%)e_41}|{M7nw&&1{l*?jw%h1S)~tM7Dv+ikYTKIffT@B#k5hasnDC0)2%+v9k3 z%lbQEpS;>q&n|L)Ahv{Ufutp);0)FrE2)EluXY+AUwGvf%P%i3(|(n>E?Xww+AA@? 
zH*#|6z7=UJJABH2$Fawnhh7?Wl>Fj(`6>U_KSofxsBL!ta+8sP;Q<T2Y$629CTaP3 zB_WAPIpBh9ZGfZSVFQ6ZpSez#zuLmyvS^v8(4&^B1y32H(=SJCx<2u`%9rmu{InHM zEL?E!&D@(aZ$_@OpDSw1W;RdDa=vJgLT0?QRmi8>??wMLSlt&$tx;d|L@;t)c=_pt zIX13hnR9o@Xx%(^W2)E2yo-$IYucx5IL6LySbNx!*X~N}2dSBpcWurJ6yZPBs8~6T z^ZJP&%M7LF@BLTcQ7FQhp}35DhpFu8tCn*myqKmtz3*`rla@I$yWC;sf@qE&bwB=X zlUx-;?=h#p`DnyDb6(>)m%DM#Y+0DAyVX5<)Asvpci;ZK>{51N_WI>6Gq=sM>1IFt z$A5MB=HnAii$(J+JGn05vD!lMx$|pRuT1$IJ^h2Nq<}xGboDN~bMw2b>?^*qf&%y8 z@tYj7j0_B+%=iKqp6QBHOG+~H(u+Z*k87W|&>;gJ=kIfr9^0yHcyL+qZ67xSJIC6_ zb1G$OW+E20iARGjO?LbJZt9}Sq=^$x9xi%#K;f%RPs!&3JK1N^28$Zo1ZG}1EX}&B zY}<S($E#kQE4fw9K6$uqmUQ;g{mERL9`uB4)}Ai1<y6sT;pvLv?*f;E1#i)Qe$A|P z;)1o4<)5W2Q%in1@yg+IjembK#~oIFr5V$AYNyb70o$BA@5FEapZX145q7ZzhAd@Z zU^vf!&yW1D1XNs7nUf0k;n~n!|Jw!v|MtZT{;c+N>{yt^xKUwC-ip~8e;QU7d+NyC z{%kqr)9duzoZ7X5>o0v<+ByGo?7ZrEm%pBVS*twRdW*`IQ#bzIo1nU4hhEDnqq5>! zLyI&{%K%o7wFb%DDrHrlUY@RT$y1ytG=F`D$@z%4b0n8)ew314V4EB8Gh$^f^Evt4 zspq!p_FR?etoiA-jQ#M#z*`LgYxFnQ?+9EWcqi-9(%X-Bd6q_U^=+Kue_#ICcDHY= zQvZaMcyHXk##3X<?02R`Bv0_<h9wUd7cSk>I`hljD!VsFzDH%$^{H;T6lifPew%gK z@*g`coC-9OXDn1SD$`8MsI6Q(Ni!<YNPg9Mk+&R*6RedPC1+<WpRB3j_36F+q|A9S ziC4WgJQP!(zc%~fq~blytfCPIFMaqKz2#K=4(6E~n^j%-`wt}cKiSgf84%jPQ0k7) z{7scB*ng@x7zb9o+A7hbu4wpH%K!RPgV)PH%;<JHU#vgjYSGM`T~DsQe%W@Z@Wh|v zcMnxq^F*xBnPK&zb77(2mrv(Ty_np@_B+nX;@gkkO1tLUoqtikadP&`-1}Tx{QehD zUwlvh>l10G2u0~-Un-uMFdurf{8rkGL)F(GU;p=awVs&u+C8o&lc%zzN57e3aKJUE zrmc=K;X<-RhttaoziP`3Qr4T^x88a9x8A3?*Wo<2_mW<GtgflNQgn9)gPP*6HKuce z>X%M+dKfBwEp^UizK=fk@*72;&VNz2!}t4kvr^3yxh5h9bNAZFE}v?;<?R1hy}eVj z_|(pvV!JI@@ApUdvj1AcYyX$VRb5oieSCak*_pZ--@?!Qd&eny`u?1_Jq`C(U5^m0 z*uU_a&|ibKc8C7`>-GD`0Lq>jF0;7mm>3v>IYCL6k;$GJS9eVX+JjJN*wP4MV(Gkb zF(`m!7#JBO7#gMrw>^ot!q{N@u^CK5`2pUHOd<>-@Qw*`10B>c0UH1&su@wbDd-xJ zTc4ozJ6I2x=w-re&7<o^ZW@6KAFxI+@sb%yH>|yit{u540ct;ib%Kd5R{X6gbd!)P zA5@3TWP_T7+z>(6j$FKgYACQ9z{ExlsCJb43SB>P(S>R~9~V?Va*>R#9Xa`c$~Umh dVB!;8JG`_D@MdKL$#F7pG9)lDFs$VT@c=7*z_0)S literal 0 HcmV?d00001 diff --git a/tests/test_corrupted_files.py 
b/tests/test_corrupted_files.py index 30039e6..5af0e81 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py @@ -80,6 +80,14 @@ class TestExplicitelyUnsupportedFiles(unittest.TestCase): os.remove('./tests/data/clean.py') +class TestCorruptedContentTypesOffice(unittest.TestCase): + def test_office(self): + shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx') + p = office.MSOfficeParser('./tests/data/clean.docx') + self.assertIsNotNone(p) + self.assertFalse(p.remove_all()) + os.remove('./tests/data/clean.docx') + class TestCorruptedFiles(unittest.TestCase): def test_pdf(self): shutil.copy('./tests/data/dirty.png', './tests/data/clean.png') -- GitLab