From c67bbafb2c60782096af4f6225d94e18225d2ecf Mon Sep 17 00:00:00 2001
From: jvoisin <julien.voisin@dustri.org>
Date: Mon, 1 Oct 2018 22:26:35 +0200
Subject: [PATCH] Use [Content_Types].xml to improve MS Office coverage

---
 libmat2/archive.py                       |   4 +-
 libmat2/office.py                        |  98 +++++++++++++++++------
 tests/data/broken_xml_content_types.docx | Bin 0 -> 4145 bytes
 tests/data/malformed_content_types.docx  | Bin 4131 -> 4135 bytes
 tests/data/no_content_types.docx         | Bin 0 -> 3651 bytes
 tests/test_corrupted_files.py            |  16 +++-
 6 files changed, 90 insertions(+), 28 deletions(-)
 create mode 100644 tests/data/broken_xml_content_types.docx
 create mode 100644 tests/data/no_content_types.docx

diff --git a/libmat2/archive.py b/libmat2/archive.py
index d812531..b29d690 100644
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
@@ -17,7 +17,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
     """ Office files (.docx, .odt, …) are zipped files. """
     # Those are the files that have a format that _isn't_
     # supported by MAT2, but that we want to keep anyway.
-    files_to_keep = set()  # type: Set[str]
+    files_to_keep = set()  # type: Set[Pattern]
 
     # Those are the files that we _do not_ want to keep,
     # no matter if they are supported or not.
@@ -89,7 +89,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
                     abort = True
                     continue
 
-                if item.filename in self.files_to_keep:
+                if any(map(lambda r: r.search(item.filename), self.files_to_keep)):
                     # those files aren't supported, but we want to add them anyway
                     pass
                 elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
diff --git a/libmat2/office.py b/libmat2/office.py
index 91bf2a6..3abf108 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -50,25 +50,75 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
         'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
         'application/vnd.openxmlformats-officedocument.presentationml.presentation'
     }
-    files_to_keep = {
-        '[Content_Types].xml',
-        '_rels/.rels',
-        'word/_rels/document.xml.rels',
-        'word/document.xml',
-        'word/fontTable.xml',
-        'word/settings.xml',
-        'word/styles.xml',
-        'docProps/app.xml',
-        'docProps/core.xml',
+    content_types_to_keep = {
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml',  # /word/endnotes.xml
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml',  # /word/footnotes.xml
+        'application/vnd.openxmlformats-officedocument.extended-properties+xml',  # /docProps/app.xml
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml',  # /word/document.xml
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml',  # /word/fontTable.xml
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml',  # /word/footer.xml
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml',  # /word/header.xml
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml',  # /word/styles.xml
+        'application/vnd.openxmlformats-package.core-properties+xml',  # /docProps/core.xml
+
+        # Do we want to keep the following ones?
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml',
+
+        # See https://0xacab.org/jvoisin/mat2/issues/71
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml',  # /word/numbering.xml
+    }
+    files_to_keep = set(map(re.compile, {  # type: ignore
+        r'^\[Content_Types\]\.xml$',
+        r'^_rels/\.rels$',
+        r'^word/_rels/document\.xml\.rels$',
+        r'^word/_rels/footer[0-9]*\.xml\.rels$',
+        r'^word/_rels/header[0-9]*\.xml\.rels$',
 
         # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
-        'word/stylesWithEffects.xml',
-    }
+        r'^word/stylesWithEffects\.xml$',
+    }))
     files_to_omit = set(map(re.compile, {  # type: ignore
-        'word/webSettings.xml',
-        'word/theme',
+        r'^customXml/',
+        r'webSettings\.xml$',
+        r'^docProps/custom\.xml$',
+        r'^word/printerSettings/',
+        r'^word/theme',
+
+        # we have a whitelist in self.files_to_keep,
+        # so we can trash everything else
+        r'^word/_rels/',
     }))
 
+    def __init__(self, filename):
+        super().__init__(filename)
+        if self.__fill_files_to_keep_via_content_types() is False:
+            raise ValueError
+
+    def __fill_files_to_keep_via_content_types(self) -> bool:
+        """ There is a super-handy `[Content_Types].xml` file
+        in MS Office archives, describing what each other file contains.
+        The self.content_types_to_keep member contains a type whitelist,
+        so we're using it to fill the self.files_to_keep one. Returns False
+        if that file is absent or isn't well-formed XML, True otherwise. """
+        with zipfile.ZipFile(self.filename) as zin:
+            if '[Content_Types].xml' not in zin.namelist():
+                return False
+            xml_data = zin.read('[Content_Types].xml')
+        self.files_to_keep = set(self.files_to_keep)  # per-instance copy, the class-level set is shared
+        self.content_types = dict()  # type: Dict[str, str]
+        try:
+            tree = ET.fromstring(xml_data)
+        except ET.ParseError:
+            return False
+        for c in tree:
+            if 'PartName' not in c.attrib or 'ContentType' not in c.attrib:
+                continue
+            elif c.attrib['ContentType'] in self.content_types_to_keep:
+                fname = c.attrib['PartName'][1:]  # remove leading `/`
+                re_fname = re.compile('^' + re.escape(fname) + '$')
+                self.files_to_keep.add(re_fname)  # type: ignore
+        return True
+
     @staticmethod
     def __remove_rsid(full_path: str) -> bool:
         """ The method will remove "revision session ID".  We're '}rsid'
@@ -270,18 +320,18 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
         'application/vnd.oasis.opendocument.formula',
         'application/vnd.oasis.opendocument.image',
     }
-    files_to_keep = {
-        'META-INF/manifest.xml',
-        'content.xml',
-        'manifest.rdf',
-        'mimetype',
-        'settings.xml',
-        'styles.xml',
-    }
+    files_to_keep = set(map(re.compile, {  # type: ignore
+        r'^META-INF/manifest\.xml$',
+        r'^content\.xml$',
+        r'^manifest\.rdf$',
+        r'^mimetype$',
+        r'^settings\.xml$',
+        r'^styles\.xml$',
+    }))
     files_to_omit = set(map(re.compile, {  # type: ignore
         r'^meta\.xml$',
-        '^Configurations2/',
-        '^Thumbnails/',
+        r'^Configurations2/',
+        r'^Thumbnails/',
     }))
 
     @staticmethod
diff --git a/tests/data/broken_xml_content_types.docx b/tests/data/broken_xml_content_types.docx
new file mode 100644
index 0000000000000000000000000000000000000000..41e0e49e9fbb843ce24f4b4538adf6e8991b3e3c
GIT binary patch
literal 4145
zcmaJ^2|UyP8{f=LIT~rJ961Xc;fF$wa%Dz@oaM+hckZ(!$HFkI+_&5;u?|-v$6O)g
z)*(m8eg3z8M}EEjzn;Cm`|i8#*>|7!^ZvY__k+{~QG!7L04<>OsiM}PvX`hK5CGt%
z0sxKy003iUXD1J9Cl7Ny9~WzPQ&De}<5QjIh$c8V^{s)5na)&WJu}||;Jk;0>YG<~
zBD--5Ve8*&`R$myud_*LPR)GqGb=rAT(WZ1KXyQ3CNBl!FV^+-bLzd=Gd4=f17d{8
z-ul4|Bf%2TJvE1yHY^Hzk+N^b-_w6taP(BIm1Mapd~1qJPTd+>xts@)rWdJw)=*tB
zaowOA#rI(H>G;xT+KqEeiOv!l>)WTnQ5*urxzM%e*S1ryTcw%yYa50pQLOC9E7@mq
zcgXN<3GgkTH*aQhR3?pEmAMAbXnvYL)7PqK&U_SqRa@zxxW9!>`2%ZpjnE}9aXKT0
zHIQL?iM8hvhH#Z@&r8=yE`<+A4a@N%jC##ABbN=25>09rbM98igmQJ#&w^fu=bs5&
zw{;WGj2Pfy@zZyC-%P9RXsIF*|EV0i*ZLZv^g&44O+aqlIv7`+m}jXGS^TC>!XcL;
zwDJ-Ed7w5Yl3WM?EiEm;5iPGp!v3FNAiXkovvzcciISg7;<PkDii~03_SlpcM;*_F
zw-VQRl$?3iXhL!Gi`_(;^L8Z%V~X?yWMoFm3!m>xCwo!E{R~7(RFq=V6FWs$cn<+?
z?>!<1<Fx355e(#G-)Z1X&OR9n&AQ2|9uwYtb-qwS%(dV#w~e-L%f(wfRrwg#(dv!l
zAn}ZJvHk)3R^#UsPj}RI|B&)!ujCEow=v5zDXWqeWkdK_^;l&46uNHNOJEtjO>Ou5
zGNl=bOV=b+yA(@(mW`whtDj*7bB(Wf9oR`Y`?tSnaqKahit5a28iKB-BQ<GgzjOo7
z43X9=M*#r*HzLfWZ&*27BHf%_++jCeT*$mk#gJeTV-EYaAIY~mtaZa$C{Ak`q)o+8
zI$#{}*5i-LEiPQ*`Z(0s=y=T7*zK{&)VHCqm}=Nj=J@4O{;D_(Mas%>O*Ma6XP<{+
zWN5G5kROetj3SSwYx7_aO}H%=n@mIQBaiUO#NHg)a&hdC=g`oWEuRfqWbJvX&eE6u
zvAfE$w~Mn}R9qmd5{w};ao&FQ2VwSQuQoKDJB4O0HbBcWlW6bunGbm<1Xv2wCbol?
zo-_Mq^L~m7$8Lo$`JYxSio@#5q!*({IG=rsvNs+DXNREz8wCV&&_+_V<B!d>Kc_@b
zZ{DbmeU0vmnt*by;b3<aJj6BbZz;T%%s&H>f!-U{jHTaSMSt}{B%l&9cTPwl!zR0@
z7|7UOG<%IWP1@Z1q~pJ_WjTzkrL&tg8Qh1urrM2gaKMIvaKJc0OzeF*reIQ?^%xgq
z+}`g*|5JzI#)<I}lw-!q#E9HRO4?5U>^xVy1=E{I4Qs)OTj1bZY{#FpcKb>)2?&-)
zpGCB5D27E_^a$7C#I+2qCz|yj=wlWj=N#divdYGvB^usWim2J2DMO}x(7U4hIa8=S
zZ$64WSk=}40iSeRVRnre<Jv?xA~EGiqVMgtf#CCud}>JI6tBjT(OkWn-<o2(iD0ga
z>JfunIhg{c>x_+r(rj5&?yCdJI+3)bg@R0E=M#<$e_#7hIOrXa=>>4MZLPy$dlUaC
z+aD|t4}Znf=Z*akznX~d+Ys7cdnGnM)i!g)k6Qkd1&(ES3+g!&6WYS3-%NyEq+u&f
zDzQ?N%oi@a!w|Ik5U}sEdfLD(Ky@Wy=RnO;9Ab`s2gPHL%wPV#v63SqTc79fd}ovL
z69Gd#VJTojXG&l?*lpQ`k_%K_#Crc0sW>CQCI3lMfxMjEtYCj?0%;%IL6IbyT&6$u
z0i#9V2xq>(k;c<8mZ#5*$#YV1IkyH7ORa8@qI)ao>j#RwAvo3eaW~+b7|h&!C)%`|
z2K6vlSUn_Dr@A+f#YOO0A762l`l}X%t_fX<XGk;j7$siCso4i9vM$a$kQos_ylti{
z9MF`iCosg7o;!PYN1GZ9;vdnc*ez`f#{Ti3t{i3Ra35Mkl!HJp6CqBg!qgI8DY7mV
z#J2+FMHP1P{ZU7-EAJ;q8j99BU+<PFT*^xR`1yERYW{t7BF!Yb7a5K%7`(ojwAf?6
zg@er3A8;H7B3Dn#c?8b*@|yt%L7;jts}(K#k`n4^$ig+mgtQuB=Z~Pc+|n7im$O;q
z5W;N;%PUlUrP`atnQn$g5LsVR7+3nIR;b(nVk@=Y2A-}fl@eZljcS%OG^&hRowwVW
zt$>|nPQ@g~)msDr`R2OW&G5Z0b>bRa-UW7i^<d<nMr%*G7)Wz~wU$S#B}Y=)4s;y^
zK{Pb;pVIZrzP>Hfep)x-(Y1oXXKh;%3%jtrs-9~mK~eThhBB1gFPZMaeY+)1hH(5{
zPPWo39VZsqKhpagXsmow@&whux4jbFPs}0TNWVO%IniGAhyR1b3$^8voR&Lb-L(;&
z;l>2XlR<Yx5gqqozPdMhJ`Qi{>IEKDJQ*X-A8bzTiDhk$8D0wPy~#qM!&+oHQqCYy
zAlgHivy3rt&xlG?H&3Y*9de()Q6<5{0qSjzrrSI^$1n^Tx{@B&$1TreMh~$RbxN#Q
zxxfAbb1&B6XdQ^P)ITjKLU|!d>V<;iqSUo|rszq%_D8q6M+4A`{rIvE*H<*ctUI%m
zC)jhSGfv0=S(2$7Wg!hZUu(~|WWF1w+}5C%jNvtll%voJcxSRxK}RpL01oqPYSXTH
zGcD9OD<1B#%{aT_JiHGi^CIA$>go__n@mvtPRCgP=7kN(!sy++<@iI$N|1^q_rm<*
zJAJ#?S3FH<RvsJcRH$#pN1*ZTKEujE%pmXTMh)T|cWY+G*nH{`Zpf%qA(c&zS}N_V
zOgft`mz_COR&OO>!|gzjN;LygEKoedVpMA2)t>2i@iMzzYR$>BLvP+=H7cBXfL;C%
z63XAu(1-;zDISr3XSxKz#S^ORb-+w*A>2NRbc3y%d18?Cm-{Yiu3$P_?mUJX$y4q9
zGN~LI6iG_o(8}!4ldm6pUBYD;)1n0sOx1Chx%J0Dx969|kKocmh$^FIpX4Y(eo+Ey
z4{hqMK2v?R;o0lp>QI)if~$UXF^Z;XSG&ldswK9a-Iy`&ZK=gHf9CPj^njv5tMA51
zOL8ZewW*+f1bJe5q^yGX8WkDbhL$1PQzUTpNuKccP2$I<bGP>JuyeZYPFA1G(VDD{
za7N{g6rO20-tvh)?uBRu01a(P2$9#5UkWTOce7cyH~z}bA~wtC76$XS$!)TpYwaT1
z<GYLesZYl6tdKBfTz3<cir`tD4pHpah$v*>?VOn`eFSyLt+S-BoIvYUol64a+uW*-
zC2=2L)XBMi4S(+QfYg0VMoB#9C#yVu%UR63CSu6m7RapW%vT|k=(bwcE@nB$4~xe~
z4zTM#8et@*j$ST_9$lp4zt_n0H-X$ed>pNRtgurbr9A>?-hXAzvgN0)7@lbltl+{s
z6+9N)4K8w57d|=iT_#~=?)G!Ka{(-6z3aIV>EBJ${L=c?+GhjK#9zSk;@jTukHzv8
z)QE;Y5%+Wt5R<l{lR;9cmB1|-cs+erX50N$oVYQp>1DPOU4{!OCvt_BA<zst6mrY3
z5CtM~qOioLBIy2;5&m0O@@SeSb%ukH#jvWN8l3``5B9yeBhv)+J4W=;6$#qooNu08
zS*L>Rv+^8yKQh4RF9$+&go2$|-j?S~X1nI%@8ecSedXt0Z``mAjON4l>PQcpS6}eV
z->m6w(-G$cx^jzq3fb5O_>{y6-5|a3qzk-2%N={+4DiHb+x&PTL6RgX#Mz~p-0asV
zpJW$EFL9WRbAJJ%Vl!;$UD`6nx0%;~a25CXRG4DwZ^q2vqt%i6F`1ALSJNwf3TU=?
zFP+!%{`!;yEx;|d0MQ=&2%pPeN{&+Ma)-tCyWt!OGyQY3VZE-cyDg)Q-WM1R3xtzp
zmm)G<SyoqwZC~Q=Qtz0_O0RG3@DS4FyS@Zf#5)wC#_92h1NWrtG1%G+RLPJVnz!mb
zBLQlh${)5#ZuA?R9^Bt65@o+o^5U{YJeKmd@q7Zj{<5QgSRl}%+ma((X|{Vizzc3w
zCOLkgwrNLn#ca-iQEu$kmt{YHpLcGf$pC)t?PAGP-Jo2o(xg6gz&fSx=<-zs&I-1V
z>0bhCv^Jhe-4SX<Nq`$rFE5_T$4cTm4@^a0V(pLeceGKDoDM?liu7rhhz%U%n)&t!
zpr%_eo*jY7>joYBi*#)5<0)n@g2$c=8nSuUWe%|J!As;D_xB$n_DNKwp|w?cL?1}%
zhu7%H)CE!~fc|TE<0a9T99+Tp5c2u|jB@k<F7hV;Ab<l9{18yqV)PCO7&(zaKL349
zz6lucb1*q<(8+_z&xjzY#-TCg*Tlo_lkDGr#$(cYes%M|W*@dj<c|L{Hb_GCPxjw_
z_1E;nR)XxGenv8B(*2#A`t_F%H=lq1k$md^V?=&UKP<204)rtYX#Yz8SHJo-|FCrZ
v`}Qa4|H?ls$-kx_rVm-)4h5;0e=7PvYKPRMA}t93pd&pjNHuzXgnacsSogw4

literal 0
HcmV?d00001

diff --git a/tests/data/malformed_content_types.docx b/tests/data/malformed_content_types.docx
index 43ac7437618f8f49e52c2006526efa087cb0c011..cc5caf3515b228391273bd7f0ec615ab0bad915a 100644
GIT binary patch
delta 505
zcmZ3iuw0=&z?+#xgqeYXgW>yDN8io!9GChrGBEJ5GB5};Ffc?r=jWBA=9R>UR2HNb
z$Ldw&=4=hz?RVHfWbVmum$<-3hYkp+zGWzWd7!VS_;S<{{-3)R{;gi;DaH9K!1cCf
z=eeEFrxtCyn0u?yJ=OQ9+$T=&^vg?XZd)CgV$dua-THv5{#@WX2ktX#nVmh7_lh;z
z{heg3Xn$1b$K#YwN0yp0`kZjOlCak!QG46BWfCSvYnN=#m_5OBHTP{3uR|Yh#QrGm
z`8`=&`}!=M>U#dQ93kxCr+WY0)%@?bJ!jUPsq0Nz!cCW)uWjAobhM>-?Iin2dyK!%
zm$-Ss_J3-%Kg%?|7XIS;g`Um(g?nr!P54+}{k8P<!aP3dU27Jeo1?ynpX*z1=N<(S
zOV8(mPsGiqUOOB)ZDNJ$`OB~CBIEgm`yUHDKUM55zDk!daHi@><ras0_fKyw%y`nK
zDCO<+XmRUdO}8J%4BcF~Q~BH_HGg|PGmA0iY0Kqyuk-Hjm7KKX$wlGio=l&3CvW7C
z+B}s}g@+d<1|}clH4$TQy~TJU@(Lp=Jz0QHhp~RLEuRJB#K~oRwv2BlZ{f3J>SLX}
pj8$&35Wg1V+{w25c8psm*YMjj3QgVvWPh9dn_r1-BQG#g830g$*INJp

delta 502
zcmV<S0SW%6Afq6EP)h>@6axSN2mtJyK25JStmH}o000XG000vJ003J<Z*FvDZggK%
zd2nTOT`qWSY_(C#PJ=)cUCURH*&3MY!UchLp=<5Jy%~o~JCS*0?iAa<_l{5!wQU+(
zRzS|V=S8{L>Nf4bqq3%AclnH^i+NJM?i^rBn8AvzLvRIu=P1{p1?JX)A*8Whi$I@#
z#hs8JVg>wuHhbXGngAwbA!@K*q_7p+cvB1-nO4$Uv`xsQ)rD;uC1H5%FZ?;i{mGM+
z8%>tIzlT-?s)K9gM*r@Vf2UhEPj1Q=8p5j@(#O6+T%lsZxmL+<N#2?ISBz}vp8v4D
zB?3#A!iQ&np-F?k5sW;Nknw+e^>yo^ZVp?ztf9`FC!!Au{u&c`NhorX&LhY#ZYr&Z
zvn(U=N}i|J{j|OwCLYHj&XwnmE>v3rqB1LzM+kx5j^x>+G0B7>8cRary@kUmMtI^F
zBcg>ehokH%6DPs2YvIU0>9H|Dl9W7p;yg*d08mSl6bcx#nE@mW4eXpgO|Ler<Vpbm
z01K1K4Kx;jms5u1SkwW4J@JF&SkwW4ms5u1Sd$bEECG9yK@K<pjgxu~Jpt*HxDGx7
siUpIZ1s;<Y4=MqflR*zY0kM;S4?h78lfVx?0rHdo4<ZJw4FCWD0G^HBGynhq

diff --git a/tests/data/no_content_types.docx b/tests/data/no_content_types.docx
new file mode 100644
index 0000000000000000000000000000000000000000..d0e0330fd236d7752b4c3660f70c5c1e33b601ab
GIT binary patch
literal 3651
zcmaJ^2{=@38^&O4g$ZS*Vq`6uA%rX`Wy=^ML?|iS*t3tN5SeH&mcrQA?2|2JOGL&_
zL$=Z)OUOPN|4iSKy8f@{T<2Wpoa@}@ocDg7_kQjdsS9A_p<`iTp>x8h>e1~AcIvs6
zyRDN4Op10aj@8ozsB?sW{my%KVa(}RL_2X!=!}cdDljy8ZlQ+=JYiqFGp^27hrE~G
zR`2_5@n9c{NJ&SeL`A8$6xyr1!F%iA4nCtQFahg+Se=RLg8`jni=&0(p_x}f+A$HW
z7v>6NrQPx$L2u|Awkcl|s?5W>ja98D1<9o2r3Z%`+RdLbJRxgspOp9Gs}K$qy<wSa
zQCg`W#f$L0-fNxZTi~|oAdBPhxnlR-KSP0oxOh=kvs=Bycgak_w5l2>o@0LAd&geZ
zC7`pu&8gS&T~t?Q3kkA<N9qDuKKIZcAyLPx!azrd{7*)>s83vXu|c}KxO%{@y1GjF
zpqx^%R9d9D!oO1PiL8w1UAC2o)msASGqINpnMc0zJXx_Rn4GXS0Wmk*A2U9Fb9^fG
zRVXZ`3bvOkZmC4HG8W5_vOH2#C0g3m@2P$-w9lU850q0>7t(cW9qt82*a`9~Hsw6@
zjF?L3%T_9r!I8X3q)j`K8<^v(Pg4yRKM#)IKC5)|ai*(=D|khgBV;<($G>qW+@bXO
zx~@x?#I$k~q%0$m<#xXn$tNCdBgvA`30Qo}<(DP=F)9ML8L=2}SiL9~XRL^Sj2RWE
z{u<?AKE{(3jtXoR6VJw&$=6Oivef^S68(PTa@B(unEt3qh`?$x?6#Vxj8535+6%e7
zBVa|yoiW`9Y?Kwu7hgm?Dn5hEFOLkL>Uqab<5t=71>!J09UX<b|2MbXyScS-akr(B
zd*ASielwg0y>23jo~V<Sep80cpV9{H69i8<`121waU5x$oESwpr7us8s;sA^kq2ky
z1Us!cU*6NP6_32e6MT(#e_?x%pB$%{cv<vOM5m5=c(iq|WJ9uyo~i9*s}UHp&l=#8
zEqPVxtobL|rZ*MGwH%I=BGcX)oj3fHAyJk)7sVH>=@xKTM4_V~t45k*but2xkg_Mi
z?`FqP@CgnPEhO=su+E~{Y@?R{s(Pn|c#f;)9+Mmu#eB|79L;qlSxTs!=R1rI$I}w$
z^D~fLg$L3D{2W5zfB|~Wdit4;)z^+&8-#~h0X$-H@aLTUKDf7WD+!qXbqUJqbLqKv
z9n*XKnN>epCv%T%LcFG9Lfb@)TZu4bAa6-x@pUb^JjsGv>_IE{=_sx%hfUnkn#=Lz
z9W5IfuoY$iLcr~rJNIpUIs3R$W3J;<@&@C_IxO>~jhF@Nl{(Iy?Y3<QEkRX9ps;Jy
z<c$8Y_zzMO<n7{q9rl+dQ0Kudl&!NTElq#v16Gf%8O{~9o+k8qJlB{Do9nFMdTf<W
zI<=}vp7oWupFiN<W$|~-AGXn7rD0Z9<mf9sK-B$UN$rpfgQ~t<ZddW@ev!vf+Rxh%
zh8C>FULmcJeT)PR=T=|j@iiIYp^V75ku6I@Npwr9kr+u3pEGlttk28?5FIsU*e>Y^
z#+|&|P=>nV7zQaKs(``R$q?s5;ac&})j^B-aqaY~QflP90Ms7b@|&sArlQrZ7u%(3
zr!$j2eA=Iunir-`1WxgJ)9BcQ!5dqtgWdN>I%xI!gO1%q<QVC>jKVpdeKk2yCss9_
z*^W_q#t88;<rXAyBCnJ9L?ao_wRHu4&)z6<41t=$atkz{YxZRd;4Lu-BIp@|c||~K
zxyEIBT!r4Nz{3qC@{&t0P_1&NW))E@bN1w!a@bL>RBS?QqcxgdWVVOTlF;YcAfqGb
zlW#B5$b%f#Y40tQ2IvlfYK8RLvgH))0ha(^L{qEiAw#dMOIwPahYjN&Ud$h^?%0f+
z--dm!?7e6a6y?Bas>ld^#(4+s*CS^^N*3*Qwo_Ps&A-6+fz5YEXZh<HFF*}^%RAnK
ze-`;t;n^|W$<E4?0e2Hl)t1Q#*pS0}Y9qTM%<JS12HlcEybgo;8D8%FFtTB26u47f
zI8L0~*?9L|I&)*(^mJg~Rc;0YP?61O8M|1%RBzp^O^k_0dQ^h8RZ6WC$z$$vrL52a
zKwoP#>&D($_7O1YJU+Z1sw!m32DXuMPAFdvTdT+3dEmIW0RSoqNDGQQJ0B%qujaHM
zf3cA>ddjHt;kBMIG)8@pQ2O@LvQD^dSLWGCzHH`nenoojBqk>%aFfB8+7oRV10#%E
zI&5+=!j|_`81&Es7UXhPw&U|W;a)8r`ZX`#OEk~OM0jp-%q+W%Q0QrOfxe@;LZVKS
zNyb0v80eq6xZ&dLX>|3P(+?plMyexP;9T|J*gCzx5P+eXxxAovp?>Kf=*@5Tn^p{C
zhlN+xYY@kv?HTFgbE%|cl39sbDz6H&eA-b(Jg=dky%j{sXc@ikz9Y`0nGSv|_IR4x
zti;5-Gs8*w9G`t^&B3Fjmv3-7<<7nI-2r#w%U%M3(&!fTJ*op&7QxAJb(IbVJe(aN
zP~QaB;r5MOX)yj7#Z}9Vht&?6%U&aQsB=&;^?(jT;+e0QvwV<)FCKZHPEI#x!H6L^
zt76YVjmH5u=ayvlB&UTCHO4GIsxSilqr|lC-)Oi{ty#V9)#vEuSemDiT=h^n3fQu(
zUu06*_Mnr`oFn&DiS>KYjQy#2bWy?eZ{~@MD*RmfOc4J%RU-bLlA6yd6OG)aHWJGr
zD!ImhKau<Ir1P-#^t5-r=|NMUbJ4n>W;n;$^%SA^D#B%x{m}Vnb~+$SaR^b^OH`gm
zLFH<zVPD*N@&Yc?_Zk-as>OY(QE*im<N3{1_0UIicxFgAS8`7agsIM}3J+Ev)QK!$
z7w(##DtQQT%xSP;tC++XRUS*^A#}J`?n{L3UognNbdhlE(~x`^HoZ7b;N$gNQJWd;
zKnpSC`zFA$<;WKai|CG8P&b!~)7yppqdT~@AC2%Hqk&NlwLH41`+qk=&VLHX!_(Kv
z_Qwdjj8Xcda4yPoEACByZS{x@2l{eBf^+^O@$KLu4{gbVqu&(cmuGK2Wj%)GF6~>(
ziNt@qlIEY*zuGy2J`#6|AWZ0ZLm7V{l3ybgS}5b?ftFUd!K#R4(kh1AunT+nEzfiY
zXgEW$AbhEvY`5ur3V)8o5*U)Mf<kVZ7N7t`0Thn-<Tz&VU}V7N`CMSjl)*?a@-eJ3
zsKy|l`>jJ?&glC(?OSGS(dF^_69O-*&#y6oDIlReZ$^hW0#pEq*P%Sl+^@>Ar?T8~
z2w}-9V}7c0FV-*H1xAYy`V16Ctg23V<!#jTbQs78(7Qopyd-Yep?!;EB`#BM6n68>
zvp^r5IzrF?$SyBVLYyi|YO(fd7FP##%BJ|l@WqZ(u^#mRCSKE~zQs**LWfliAi4aG
zhz4g&<JFjHLbNv0I3@%9_5!}bHy^kueENie&zC0$Fk(>Yd9eQQ2l#B>Vp7zZZV%Xl
zLHA^*y6M5$neaZh_U*Q@W}j0WrumXdN{f*hZrm%&#E#E#x0%V7N(yTmWT85|YWL@$
z@;Ju=)C3y=vEz}LH4a;yhMX~lLULDprey)m@A5`&BsKdFzaOT2FOuRrRa}2gHV(&l
z(|j%--gwR_AUu%Xy2s`~#F?3%EwnfMda2yRsoEB@)UxHQ35Uw~wa-ib0low7V@Y(P
z(5=UEsfIy0xHD76kRjWY{=G{V)C9_TKj1$H*66KQ%ioe{N6GRup`Ixp%EQSKx^}Ke
zJ;OQd6@A^oJbE|?v3<N>zgT)`C&$vSR}A&O4eRwfFlo)?HD!U7w{s%JvOajcaM+aB
zry*kq^c`NT+DxI`M^LDx3S_a<c*qt=?S~gxX{Afgz(e<Q&9K`$)7A{X#;z5_@4(&0
zkY=ra28gNzzZ>n}(Ysv>O&5NKGxbORMgL<Ze~0hx@-%DnGdQXG{wJgJ`<r(6g<pT<
zFw_6>2*1O3D<7?S{0w>E-|$~r$?y2xs`BgS<5>R2?^es-;kz?Slb)a9P0jTW9)1cD
UQkRLD_5cfYBU5uE!$G_HAHZGiNB{r;

literal 0
HcmV?d00001

diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py
index 4ac2678..8d7c252 100644
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@@ -86,14 +86,26 @@ class TestExplicitelyUnsupportedFiles(unittest.TestCase):
         os.remove('./tests/data/clean.py')
 
 
-class TestCorruptedContentTypesOffice(unittest.TestCase):
-    def test_office(self):
+class TestWrongContentTypesFileOffice(unittest.TestCase):
+    def test_office_incomplete(self):
         shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx')
         p = office.MSOfficeParser('./tests/data/clean.docx')
         self.assertIsNotNone(p)
         self.assertFalse(p.remove_all())
         os.remove('./tests/data/clean.docx')
 
+    def test_office_broken(self):
+        shutil.copy('./tests/data/broken_xml_content_types.docx', './tests/data/clean.docx')
+        with self.assertRaises(ValueError):
+            office.MSOfficeParser('./tests/data/clean.docx')
+        os.remove('./tests/data/clean.docx')
+
+    def test_office_absent(self):
+        shutil.copy('./tests/data/no_content_types.docx', './tests/data/clean.docx')
+        with self.assertRaises(ValueError):
+            office.MSOfficeParser('./tests/data/clean.docx')
+        os.remove('./tests/data/clean.docx')
+
 class TestCorruptedFiles(unittest.TestCase):
     def test_pdf(self):
         shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
-- 
GitLab