From e342671eadd3f5ff922fe62cae81792d4cd65e83 Mon Sep 17 00:00:00 2001
From: jvoisin <julien.voisin@dustri.org>
Date: Sun, 30 Sep 2018 19:52:35 +0200
Subject: [PATCH] Remove dangling references in MS Office's [Content_Types].xml

---
 libmat2/office.py                       |  34 ++++++++++++++++++++++++
 tests/data/malformed_content_types.docx | Bin 0 -> 4131 bytes
 tests/test_corrupted_files.py           |   8 ++++++
 3 files changed, 42 insertions(+)
 create mode 100644 tests/data/malformed_content_types.docx

diff --git a/libmat2/office.py b/libmat2/office.py
index bad352b..b220092 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -151,10 +151,44 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
 
         return True
 
+    def __remove_content_type_members(self, full_path: str) -> bool:
+        """ Remove the dangling references (to archive members matching
+        `files_to_omit`) from the [Content_Types].xml file, since MS Office
+        doesn't like them. Return False on failure, True otherwise. """
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError:  # pragma: no cover
+            return False
+
+        if len(namespace.items()) != 1:
+            return False  # there should be only one namespace for Types
+
+        removed_fnames = set()  # archive members matched by files_to_omit
+        with zipfile.ZipFile(self.filename) as zin:
+            for fname in [item.filename for item in zin.infolist()]:
+                if any(map(lambda r: r.search(fname), self.files_to_omit)):
+                    removed_fnames.add(fname)
+
+        root = tree.getroot()
+        for item in root.findall('{%s}Override' % namespace['']):
+            name = item.attrib['PartName'][1:]  # remove the leading '/'
+            if name in removed_fnames:
+                root.remove(item)
+
+        tree.write(full_path, xml_declaration=True)  # overwrite in place
+
+        return True
+
     def _specific_cleanup(self, full_path: str) -> bool:
         if os.stat(full_path).st_size == 0:  # Don't process empty files
             return True
 
+        if full_path.endswith('/[Content_Types].xml'):
+            # this file contains references to files that we might
+            # remove, and MS Office doesn't like dangling references
+            if self.__remove_content_type_members(full_path) is False:
+                return False
+
         if full_path.endswith('/word/document.xml'):
             # this file contains the revisions
             if self.__remove_revisions(full_path) is False:
diff --git a/tests/data/malformed_content_types.docx b/tests/data/malformed_content_types.docx
new file mode 100644
index 0000000000000000000000000000000000000000..43ac7437618f8f49e52c2006526efa087cb0c011
GIT binary patch
literal 4131
zcmWIWW@Zs#W?<l8cr(Y&cfHw~Cti#U4BV^?48jZy4AIW{c_pcNCGjDZ1*yfcdKI}j
zTLVw~H9LsJo(`XIQ)<SmL#&Nu3$x!I+AG=aZ9TCv<(<gw{onfngtq2NMTa?@+xgse
z=Z#lpZx1Zqv@EIoXV22^%AkGkWD>k)9PC=_T)_HV<N5;DXV)4Pmu!sdc9>sxbnXQC
zB-TH5X4MZa&0<hXQOw#9w`7Cp?TTO?shMGyqPO^-SaP*BS4uUp;*I_v%VU2}PM#|5
zJ8kd&_Ar)J&Dl@g{=b`2zdYK^Kkt;d)S=Z<myYjqidmR+XlK~Te9wC`e}(ttyqNcY
z!(LS;@9Bry(-(R+?-%T`nKa>Jef8JU*9-IbqIa!Xcy5mRB7UxaQo@y<8by=NDWA~K
z(^}iUSx5P!*SzJ||8ChQrzUq?@!ZtseR?6$OpA=PC%JPp-s^jEbFtyc7DXv<r$>8R
z4{N$rJd#je)N0he_>HEp`oRs^4^P;?+-T@9X^KtNBOA|s4B((=;Nak3$my~3Wk3Sl
z3=9nMMX5Q(`g$O8?Id4sW=Ed3_kTs5-`&m8?l^Joo03z$(pUDDnQz`*ILB_Bx%U4(
zN3LA~T?>xysr>vw=2Gste+&HPbay)*UX|%sXmD|tK~}|WTYdS2%lf;*?Iv9Dni-?D
z>R!vDL?O@qj-x(rR+<<Vu9&Tu<{xz2CP``Y@`;6aw|$>cZ?s^(;nizdC!%*VZT;=>
z$K0$+Y?DfhT3Xz)nDv`2^hEtCQ!XYfu3TC8Gs|QuPep9T-|B@HJm)?Ho4B8MTvz!q
z(jsEp_Nf|6qut8?XPV^KoZfRh=VIKW?z4*y-<19`Kfs%v<HZF=<(mu)47N-RctS*g
zfq@|<KRKW%zo1w@v7i8y66a3z^=38{XnX&!OZCew->6j0NxmPM{aLx!U5oBKSz^BF
zr^3u>-|kCCM~e5~yO)0N;oOrg`u*GV`2;3^T&K2q(nO{?pKk5grndg<<r2rPmP?s8
ztJzI09hJNbk6ypX-jSgoYI$(!tdfq0(=IKs+F&&GX4%b~KQmO*dUU_;o*Ve##npRF
z&Q|Gb78SS@NPRKkX?!xNqWZx9wygF0zI*4N)qG-eP<q3{=^Ra$<8M|>sZG}5n0A`^
z!)}49#VXIbJEs2Z_)sI~xO&pmP|NvidTz^af8U)IeOF|0TW;MUb&VxGk!CyZ&yMqd
zKBw>TkEm@Ej`UpaejqLXb*6rkeTk7*>reY5rpuM3ETx<8dQaf`_oe4mrQej?DGPr~
zngz5yyzqz{6t*^TNBra%85sUC;tN|r<giW7FG>XkZf{Vm{~-gB+V9~ywfA=!8s6A2
zam7PVAz=ln`&reJSGQ*0I`rWF?cAINpB~({{XS>j@2fB0D4b5<Ki1`ys?nJw(vT!7
zzUst<DpP)SjSYQTey6=0+xikN>g=Cs<QtLt;An_ck8lEW{t}%;E9dCvCI@e9()Gwv
zULP>;PKaC3^M#rlmc8i~Yj7{DX;-y4wQ}(eL!Pe>I{c>1;hk2Me(G9-F^{T8z_~{%
zULPV~AMmLD>UcUvV`+gqZ}?JM%N6`#Jco9zTWpoPbl-oL{krp}zg@8~;Or`i1vOP!
zEe6b282R@wzC88yO!luITV^e;5mBFPu#f+8#nd~Kzf9}7{9W_k*L{X>9-Vr^Tg_(o
zEMcbLt)J3mPx@PqtA-vur*Fe9x^DW~6c5woIx9=L>%a6e{44k(7hY8B{%OkZ{~pOk
zQt>@kq_<4vedGH1`==$kRtJ`4@BaOR<=L)@Y!8yvV>n}X@&EsG{Esv!g0`&|YE5Ef
zV7SeMFM?zk7#Pa)i&FF<WkO1Ra%paAUI{3gAmzhEU#>$20<GWYDV@2uEL33PvOJdp
z?XL`mbGIEd<2<QRRn6QLrSa&{(?7NE4fW&WfA_^+V9)Js(D7_s7`W}yGQk3k?Uz;8
zbbIbQ?iUopxwfqFsI)N47MHxEl>xfnj8v{I?3{e-S6q-z?cup0>NgeUFMZkc+n-H@
zS?zWx)1P&x8m5}J@86Ico82nC`kbwl)Wipkd9rODQ}#IueONL11f!jv{qN;9xx7<9
z-FSHW;OehukNjD0@33gb)92#z<}PpbJjeb}tQ-^^KlKd`9A#i&5XKuEp!5n34&;DX
z8seLO+kj{9`*4X}>f5d_I?-dbmqofPLQvr*e?ZDjF||%6*W+jF{x12kI=fLZLVwvx
z_kHe{7Rk?#>+w4$w3jJ*Q_b9sE>VnAH~F5dliR<}Oy}c~+@q!uk(;`|yvh9ia-+VM
zz}$(`CLKtqWmJ8AK`d^|rGouNUJ4Z}GF1<V1YGwzadCqov-dTjol3sPmzY{)GKVos
z`5io}CL2_?IP90@X}O>&vw~M#-+t<6=i5K}e>YzYj;Zg?;*YRoQQXVlY*2N<H0I_^
zwexuy7N5>Yz7u=ORr%lR(|e~f<{bvV%BK`dz79BMu~*ys!RgKBHSN=^cWyA1PyXF@
zVQ1&rj_6&cGWDf;erH<stAe5~KE3rLD5UQH##Q&uz4`y+(O<(wKkh|1)LlvxWC|2o
zoqT%(xB3dbi@RPY_lFlR=$_^oKWC@j&Ehvvn@yA?m@gge<NU$*n){a2O}F`Nmlf@l
z;<%)e_41}|{M7nw&&1{l*?jw%h1S)~tM7Dv+ikYTKIffT@B#k5hasnDC0)2%+v9k3
z%lbQEpS;>q&n|L)Ahv{Ufutp);0)FrE2)EluXY+AUwGvf%P%i3(|(n>E?Xww+AA@?
zH*#|6z7=UJJABH2$Fawnhh7?Wl>Fj(`6>U_KSofxsBL!ta+8sP;Q<T2Y$629CTaP3
zB_WAPIpBh9ZGfZSVFQ6ZpSez#zuLmyvS^v8(4&^B1y32H(=SJCx<2u`%9rmu{InHM
zEL?E!&D@(aZ$_@OpDSw1W;RdDa=vJgLT0?QRmi8>??wMLSlt&$tx;d|L@;t)c=_pt
zIX13hnR9o@Xx%(^W2)E2yo-$IYucx5IL6LySbNx!*X~N}2dSBpcWurJ6yZPBs8~6T
z^ZJP&%M7LF@BLTcQ7FQhp}35DhpFu8tCn*myqKmtz3*`rla@I$yWC;sf@qE&bwB=X
zlUx-;?=h#p`DnyDb6(>)m%DM#Y+0DAyVX5<)Asvpci;ZK>{51N_WI>6Gq=sM>1IFt
z$A5MB=HnAii$(J+JGn05vD!lMx$|pRuT1$IJ^h2Nq<}xGboDN~bMw2b>?^*qf&%y8
z@tYj7j0_B+%=iKqp6QBHOG+~H(u+Z*k87W|&>;gJ=kIfr9^0yHcyL+qZ67xSJIC6_
zb1G$OW+E20iARGjO?LbJZt9}Sq=^$x9xi%#K;f%RPs!&3JK1N^28$Zo1ZG}1EX}&B
zY}<S($E#kQE4fw9K6$uqmUQ;g{mERL9`uB4)}Ai1<y6sT;pvLv?*f;E1#i)Qe$A|P
z;)1o4<)5W2Q%in1@yg+IjembK#~oIFr5V$AYNyb70o$BA@5FEapZX145q7ZzhAd@Z
zU^vf!&yW1D1XNs7nUf0k;n~n!|Jw!v|MtZT{;c+N>{yt^xKUwC-ip~8e;QU7d+NyC
z{%kqr)9duzoZ7X5>o0v<+ByGo?7ZrEm%pBVS*twRdW*`IQ#bzIo1nU4hhEDnqq5>!
zLyI&{%K%o7wFb%DDrHrlUY@RT$y1ytG=F`D$@z%4b0n8)ew314V4EB8Gh$^f^Evt4
zspq!p_FR?etoiA-jQ#M#z*`LgYxFnQ?+9EWcqi-9(%X-Bd6q_U^=+Kue_#ICcDHY=
zQvZaMcyHXk##3X<?02R`Bv0_<h9wUd7cSk>I`hljD!VsFzDH%$^{H;T6lifPew%gK
z@*g`coC-9OXDn1SD$`8MsI6Q(Ni!<YNPg9Mk+&R*6RedPC1+<WpRB3j_36F+q|A9S
ziC4WgJQP!(zc%~fq~blytfCPIFMaqKz2#K=4(6E~n^j%-`wt}cKiSgf84%jPQ0k7)
z{7scB*ng@x7zb9o+A7hbu4wpH%K!RPgV)PH%;<JHU#vgjYSGM`T~DsQe%W@Z@Wh|v
zcMnxq^F*xBnPK&zb77(2mrv(Ty_np@_B+nX;@gkkO1tLUoqtikadP&`-1}Tx{QehD
zUwlvh>l10G2u0~-Un-uMFdurf{8rkGL)F(GU;p=awVs&u+C8o&lc%zzN57e3aKJUE
zrmc=K;X<-RhttaoziP`3Qr4T^x88a9x8A3?*Wo<2_mW<GtgflNQgn9)gPP*6HKuce
z>X%M+dKfBwEp^UizK=fk@*72;&VNz2!}t4kvr^3yxh5h9bNAZFE}v?;<?R1hy}eVj
z_|(pvV!JI@@ApUdvj1AcYyX$VRb5oieSCak*_pZ--@?!Qd&eny`u?1_Jq`C(U5^m0
z*uU_a&|ibKc8C7`>-GD`0Lq>jF0;7mm>3v>IYCL6k;$GJS9eVX+JjJN*wP4MV(Gkb
zF(`m!7#JBO7#gMrw>^ot!q{N@u^CK5`2pUHOd<>-@Qw*`10B>c0UH1&su@wbDd-xJ
zTc4ozJ6I2x=w-re&7<o^ZW@6KAFxI+@sb%yH>|yit{u540ct;ib%Kd5R{X6gbd!)P
zA5@3TWP_T7+z>(6j$FKgYACQ9z{ExlsCJb43SB>P(S>R~9~V?Va*>R#9Xa`c$~Umh
dVB!;8JG`_D@MdKL$#F7pG9)lDFs$VT@c=7*z_0)S

literal 0
HcmV?d00001

diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py
index 30039e6..5af0e81 100644
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@@ -80,6 +80,14 @@ class TestExplicitelyUnsupportedFiles(unittest.TestCase):
         os.remove('./tests/data/clean.py')
 
 
+class TestCorruptedContentTypesOffice(unittest.TestCase):
+    def test_office(self):  # runs against a throwaway copy of the fixture
+        shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx')
+        p = office.MSOfficeParser('./tests/data/clean.docx')
+        self.assertIsNotNone(p)
+        self.assertFalse(p.remove_all())  # cleaning is expected to fail
+        os.remove('./tests/data/clean.docx')  # delete the working copy
+
 class TestCorruptedFiles(unittest.TestCase):
     def test_pdf(self):
         shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
-- 
GitLab