From c67bbafb2c60782096af4f6225d94e18225d2ecf Mon Sep 17 00:00:00 2001
From: jvoisin <julien.voisin@dustri.org>
Date: Mon, 1 Oct 2018 22:26:35 +0200
Subject: [PATCH] Use [Content_Types].xml to improve MS Office coverage

---
 libmat2/archive.py                       |   4 +-
 libmat2/office.py                        |  98 +++++++++++++++++------
 tests/data/broken_xml_content_types.docx | Bin 0 -> 4145 bytes
 tests/data/malformed_content_types.docx  | Bin 4131 -> 4135 bytes
 tests/data/no_content_types.docx         | Bin 0 -> 3651 bytes
 tests/test_corrupted_files.py            |  16 +++-
 6 files changed, 90 insertions(+), 28 deletions(-)
 create mode 100644 tests/data/broken_xml_content_types.docx
 create mode 100644 tests/data/no_content_types.docx

diff --git a/libmat2/archive.py b/libmat2/archive.py
index d812531..b29d690 100644
--- a/libmat2/archive.py
+++ b/libmat2/archive.py
@@ -17,7 +17,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
     """ Office files (.docx, .odt, …) are zipped files. """
     # Those are the files that have a format that _isn't_
     # supported by MAT2, but that we want to keep anyway.
-    files_to_keep = set()  # type: Set[str]
+    files_to_keep = set()  # type: Set[Pattern]
 
     # Those are the files that we _do not_ want to keep,
     # no matter if they are supported or not.
@@ -89,7 +89,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
                     abort = True
                     continue
 
-                if item.filename in self.files_to_keep:
+                if any(map(lambda r: r.search(item.filename), self.files_to_keep)):
                     # those files aren't supported, but we want to add them anyway
                     pass
                 elif any(map(lambda r: r.search(item.filename), self.files_to_omit)):
diff --git a/libmat2/office.py b/libmat2/office.py
index 91bf2a6..3abf108 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -50,25 +50,75 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
         'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
         'application/vnd.openxmlformats-officedocument.presentationml.presentation'
     }
-    files_to_keep = {
-        '[Content_Types].xml',
-        '_rels/.rels',
-        'word/_rels/document.xml.rels',
-        'word/document.xml',
-        'word/fontTable.xml',
-        'word/settings.xml',
-        'word/styles.xml',
-        'docProps/app.xml',
-        'docProps/core.xml',
+    content_types_to_keep = {
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml',  # /word/endnotes.xml
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml',  # /word/footnotes.xml
+        'application/vnd.openxmlformats-officedocument.extended-properties+xml',  # /docProps/app.xml
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml',  # /word/document.xml
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml',  # /word/fontTable.xml
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml',  # /word/footer.xml
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml',  # /word/header.xml
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml',  # /word/styles.xml
+        'application/vnd.openxmlformats-package.core-properties+xml',  # /docProps/core.xml
+
+        # Do we want to keep the following ones?
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml',
+
+        # See https://0xacab.org/jvoisin/mat2/issues/71
+        'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml',  # /word/numbering.xml
+    }
+    files_to_keep = set(map(re.compile, {  # type: ignore
+        r'^\[Content_Types\]\.xml$',
+        r'^_rels/\.rels$',
+        r'^word/_rels/document\.xml\.rels$',
+        r'^word/_rels/footer[0-9]*\.xml\.rels$',
+        r'^word/_rels/header[0-9]*\.xml\.rels$',
 
         # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
-        'word/stylesWithEffects.xml',
-    }
+        r'^word/stylesWithEffects\.xml$',
+    }))
     files_to_omit = set(map(re.compile, {  # type: ignore
-        'word/webSettings.xml',
-        'word/theme',
+        r'^customXml/',
+        r'webSettings\.xml$',
+        r'^docProps/custom\.xml$',
+        r'^word/printerSettings/',
+        r'^word/theme',
+
+        # we have a whitelist in self.files_to_keep,
+        # so we can trash everything else
+        r'^word/_rels/',
     }))
 
+    def __init__(self, filename):
+        super().__init__(filename)
+        if self.__fill_files_to_keep_via_content_types() is False:
+            raise ValueError  # `[Content_Types].xml` is absent or isn't well-formed XML
+
+    def __fill_files_to_keep_via_content_types(self) -> bool:
+        """ Whitelist, per instance, every part that `[Content_Types].xml`
+        declares with a type from self.content_types_to_keep. The set is
+        copied onto self first, so the class-level one is never mutated.
+        Return False if that file is missing or isn't well-formed XML.
+        """
+        with zipfile.ZipFile(self.filename) as zin:
+            if '[Content_Types].xml' not in zin.namelist():
+                return False
+            xml_data = zin.read('[Content_Types].xml')
+        self.files_to_keep = set(self.files_to_keep)  # own copy: don't leak into other instances
+        self.content_types = dict()  # type: Dict[str, str]
+        try:
+            tree = ET.fromstring(xml_data)
+        except ET.ParseError:
+            return False
+        for c in tree:
+            if 'PartName' not in c.attrib or 'ContentType' not in c.attrib:
+                continue
+            elif c.attrib['ContentType'] in self.content_types_to_keep:
+                fname = c.attrib['PartName'][1:]  # remove leading `/`
+                re_fname = re.compile('^' + re.escape(fname) + '$')
+                self.files_to_keep.add(re_fname)  # type: ignore
+        return True
+
     @staticmethod
     def __remove_rsid(full_path: str) -> bool:
         """ The method will remove "revision session ID".  We're '}rsid'
@@ -270,18 +320,18 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
         'application/vnd.oasis.opendocument.formula',
         'application/vnd.oasis.opendocument.image',
     }
-    files_to_keep = {
-        'META-INF/manifest.xml',
-        'content.xml',
-        'manifest.rdf',
-        'mimetype',
-        'settings.xml',
-        'styles.xml',
-    }
+    files_to_keep = set(map(re.compile, {  # type: ignore
+        r'^META-INF/manifest\.xml$',
+        r'^content\.xml$',
+        r'^manifest\.rdf$',
+        r'^mimetype$',
+        r'^settings\.xml$',
+        r'^styles\.xml$',
+    }))
     files_to_omit = set(map(re.compile, {  # type: ignore
         r'^meta\.xml$',
-        '^Configurations2/',
-        '^Thumbnails/',
+        r'^Configurations2/',
+        r'^Thumbnails/',
     }))
 
     @staticmethod
diff --git a/tests/data/broken_xml_content_types.docx b/tests/data/broken_xml_content_types.docx
new file mode 100644
index 0000000000000000000000000000000000000000..41e0e49e9fbb843ce24f4b4538adf6e8991b3e3c
GIT binary patch
literal 4145
zcmWIWW@Zs#W?<l8Shv;D_qubrUIZfpg9<AHgD?XFL$q^#UP)?RNqk6UL27ZVUPW%s
z*1+9<hYdvLo(y-13w(6wfPm^-2IG<h_hb7qb^lCy+xG4KPPI&d3t^%r-jAN#sg7GG
z9=-MxU(JMTCQp{lnOI|Z{?+rj%@dT<oSd&2?&`X9;QE3{jkV0p9@%@-1nvKJSslB7
zgX_iHoHF;Frh+~?Nskn4JyWGOeOxAG!KJ%%`@wA+AB2T(%T;ZExb^;r=N#X)`KRTZ
zczydNC(<pUzGkWP*WJOt=7y!ri@oX}(K4Os({DSctc8kaELDH1tG?~|k+@i5)Aai(
z>kAv6ct3lreEEc9ya3-8AAhI+Ypxy_b-p9CZHK0V$hpT0`i1JaAAb<K=rD1YkHX*b
zpgh|-sx#U2msIY$TmJfJz{d!_b1^&KEos_l*`jck>m~D%j^)aA-!h7f7It1!608m_
zxN(%jKPTD6Wb(5OQ~#bg;^%Zn)1pY-_FHPh%r(=NC3|(PIkw*<dnr@PrdevnjQ;ZA
zh+yF0;9$t<vGZj>0^AG?4Dm&&ImP;VAad;_UvFkdp0@XYMV;T>&C%{SaqgRvQ@+wy
z_LiA%-d#AyZk)OH|2;>pT>)JSj_;}b{6XeY?zw*p{N{9bJ04z@=~!rRahE|>#cf-C
z`Gm{*yTa`zT=AM2qqXW@%c4Xf&;E|1K5tf<7#6OWt(fK?blfIMY4h@lg?G1opHXkL
zV7}qiYgs3vcQbAM?eWLltV(Q?N{d=r+_ISUn=SN2{VG!~CM>R8S@<){WGYWZY{uW}
zg%&*LJ_MV%pLbkW`7zQWV%zqq8cU<y%KvAY<ky_ub3Er_+@tQZiw@tE{xUzno1Npu
z1xDqY3=9mmObiUTLqvdqfgvS7IiM)Npjbb#pa7IE=T7wXW;PUPd;hOX^~){as8r2K
zz8{(WS-IC;i|#yGV!r98!pv#k?n_5Uiud2UmwxZz+><T({oC~U1SWr6r?z?0M5Z~P
zZtd8nw*Ktp634EVOPM#T*-b4SmAng&UcboRk)a@Jd2s2hl8%SdE-kUzU^Mk++0C0j
zGgQ-hbieMN8~EYH)q73OR_SXN6}S{geKFx_d@`w``oRCTto8f8d*`3kd}4D@dc(r$
z98H(wZ&pmHP1fO<cAELaZh@-BD$lw*rvB{sP$TEKdeYQT%lT`1Zp&|f-<=hGS7dQp
zZrveujU_#iW;^fCj`M#$r|<EPsBIID^jz+KAT9rOrhb!siIG?9Px~XL%ax@prJL`1
zPvH9ZrRP<p-;~@b3x7+R1++c9@Q51}wl;A`{NxxJ82&Nh3tK_tuuaY{N(BXOZ&0lN
zAp?=x@8LSN_jef@-q<j4#Y0aaVFjuCS=EwPw`Siu^x*#O+?)lU9^AJ5K4;$Vt1sUu
zoKE0B*5#F|(U~ODkR&R;>coXAQ+{=g4SiaEr@b88`Vubc?4N1m8<G0pXoysgZ~}Aw
z5}iaV=ji7q2XAcB^~h3QA29Duh+EL}g_;|dz3CQfa4)QBSG72`a`6sBp05u&{HD#}
zomQ28>RN*_kE%z&xkoBqA0l5L@TmUkcsfR7X@NU$_)=TT75rg5hjy)7Y?Zro-+z|<
zy7Q*LU9m9W>?(-`HC0(H2FzC&`S&osJoWWV_OBmXW-YD}QJ-wEkN<MT)H{>EOzXM)
zUGv}9eTHuyoqEDs&1Uy3VW!}%pVDPd`df~xh8{hqZ^JISZu;6357Xs3D@(cSzw|Qv
zEBGQ8UR3M;Y0B^a9?3>h@jX|hw@l@I<NEpgrzN^p2bN{;{{4gH*{+Ff50cemIAeG5
z|NnFRk2ENPwyhRwO=4tVxXpwwf@ByN7|QdDQuHBZLP~yeX>Mv>2`HK%<-<f@u0sX_
zt>5P<ow>IxRAAz=JeLCPuMCEBw;eR&JgHGt&D<5G@#xUgKeg`-_2c7z_r+dd&+To{
z@oZcexb4z1!2*r#msQtvd+s~#7Zk&}wyg1}v@pvSm%O8u0lMFeRIV-ToP6t7T#!!f
z;khB|Hx=eDecANepG|~W?RF^BpLM4irkc0!-;f)d-73BMoUN49#0QOevTYty_BjfD
zSTXqoqn)1p@8vbQyi-5jczFBZ>aS;y{8?}BuxQ59=i>9`E^qZb$No^P926Ws^$iXj
zWnf?s#v2@<^a>6R<bYTj;+uclfM@UfaEV>&+paG<(POojMY=3PP~j$jK*~)qwN572
z<7eysF8Q%KyHPPhf7wd+eeRbQ$<L4L@jEB9mnnKv&D@PHQH)bJ`JSwk+rQ3C=i`yw
zqoxs&o4UWe$^89tqrR5F+=<gB9Z0BURDFFxEN;uCg8fEb3Kc6dRS$>+T=zP0af2bV
z_cfuNO1{UJm|A2qhcQd}9XzTg8&tM9?3d+fxu7Ytf>&JMe(GoE+duk$H(v~nsqfC>
zkFaD>+{@o=P<6pH=H^Vb^LZH-pUz0W6MM>4`QPi)d#5ty9R|P3rxZ)R4mf79SKIr+
z>CNUf?bEDxZZMTk{@r$AXXn|D=v}5V^`&}#XIk~Ef}$=yz4ap~r0)O5Rrk)l`Tyh5
zU&BQ|?nOA%T}l*W3KUwMe0u}8`U<^^yIv>vhZisCp5_@pXQ$rH;x|#7O_U^<FCFdU
z{K5B{`<B#AxA|?C744MbxTKQx@}_P4)cS4D#O4XveEXS&*450b?{t3KZMMfg=bc&b
z0sg*+A*W|0UASA@<9Kz;`a5BtyxLOFE^>Y#wuEhgq$Q)^4AvYgse^&9b{ZdFc;yz$
zFE1|BewDZ`TPEMyD>1(}a&qau6=^Fwe9C{vvB#Q+UK(|j{Nj1}DgV|#Mo_w_ZFc{1
zlaYbp0SmrtA_U7OY5932A&E&j;DT&zfTQ1G1A#rCxlWhA+QQzlXql+cqn4@#PZ^`r
zFGp;;KJmKBm+w3Lv=vV*TyXEr+?z9RMy|7;D{9MTHc!iPzG#p_X1ugj$fw%xMgKKe
z-4{r$QD5^!FmhdZ`RRo@Hm+isb9cyS-8^<<s@KN6i;U-M+NW$d#?Ee7d)SfJ?n>+j
zshN{^ZO#f5;Xl==SUHXJ`iURQ45jAp{a4^oD8iYcxQu&;sqE>imUAV%n5H|u?{OBB
zmN_!J++pT|XpSCrKmKi#Tops_F{i)zXv8~nUgJ5JyK&EKS(vN4)jfLC_WNvi-~PSq
zQg&hX`sFS&x6QKYW<UJLe|7lg;}cGcMe{5>xh~<c+CuTU^J`bHO!*u={e!KffIq8r
z^)9<}^Si9<E55RV0{7tYn;f!?3=E;n_yQN6>55ZJN;31(i$SH2YoE8!Ap;)g?{kzM
z+p27Ma9QzfA2$O#$J)koDrIVBA{MrZM}sa+cKiKq>Y~b|i4#vAE_!%C;j2wg$>#z)
z*=NxPiyGSmW?ncf&AO{>+k7d<t6rTexmC_SdAM$tboSEy$y}Qr^n`5Io-VTGRMBSP
z>5Ag-0+)mZZ_$2!&8&6eg0++7pQS8QOMW@=%Hea3e}6K^9aes&8Pj)ar_gx;+nhV^
z#Bcwf`VCwWcCiG8EM;I|IM0C3kNmI%R9sS-lM43X+0b17+Xe#v_QebStoC&5SeV7Q
zQDIBoirE@}8dev3>d4&wY&qrA>-626+O>k~FMV6uIsbF)yy|(Ezn*?st326yi^`T$
zH~!t5pt@p*Udt+@vf^4pi!@Hl09KE+2FctiWmTVEp007pQ=BL?e|?6@`G~i3B$sM_
zl#*Uxn;Y;mVr4G#Ir-eF=eFwhT$Smp`RTTd{qV!UTMYqg^f%Y<2wWj}C+pJE+mCm7
zmPT^*ZJgqNU;fy3w{NUc|AdryZ`{7dQ)A2Qccw)oPw?c1B@Y)DF5S{P^UK{TyEjL^
zM`hIYscyLxXmKlkn|0apA3H9b3N(^uEL1cq(@e{#tz0`vGb+$Xe${!Aw;YNStd$uh
zXJ;&*tf}Gk>An4=%y}`1SG_ho6jPtSHv8eE;yui)q7er#efSx@<y8C*=9wFtRbBY|
z4<z<K+0y435Zb>`>W<I+O_eLyf2ueb2Ufh=D$%2^X!usj|N2vd*ULZ5=yp0^tUuvu
z(afA(Pp-ax*><V$#Gm7L4^>$6M6A%6VfCSNVWHrcPv=g(nB2tnJI>1D+mGK$yXM=S
ze^I}2a`wvH`&?W6{ufVQd{6)D6KSUiMd@W<DxR1yA9}R>R@#h1)z=?i|Mz#bo|yI8
zJ+3B`r?R9+znNliz%{3)t&TC_Lb61M)5{CLYRe5$)|=k9-g)@9-lw?N;XJnYl3skQ
zuBp6Ibaw`Wn&PiDrgMYpmriwh7%F`&b<SnJk3RPD8%3Yae^Ix?_xpCUQq2>&CL#xO
z_u9xVpK7}0?EhH3y;HOJ)Xtn@yDeAm_eb}#|60Rq|Ch#9T~yC~e0*ZrnYtO@!q5DB
z$0>UH{+zfy4fj@Ej}WcczwnySUxT%FhyMNR_4~&F%AOf6v$*P*7#M;%K}na9$(|Wk
z=S>CLgHUML(g<Q=>B@02D1c-b7#SoO8hRPlACJ7k$Z%V70f>h21H2iTL>NTi9TVgR
zI;dj;HULa?GNN=-&^02rK0)nwupTfmn+dZukFFcJX#^^Kz#75CcV;Bru=XmtcI2i6
zsQm=i2_|N+;%`l%n}l5XpgLqZ8`LD^h6uWL<l+@nLxJ4@CiZhcwWHKm==zb1E>!Df
txS;xxi)3`|$jJv(zJYCq5Zo~B@X{{8o0SbD$H~CSkif*iu$vde0{~d}!bSi9

literal 0
HcmV?d00001

diff --git a/tests/data/malformed_content_types.docx b/tests/data/malformed_content_types.docx
index 43ac7437618f8f49e52c2006526efa087cb0c011..cc5caf3515b228391273bd7f0ec615ab0bad915a 100644
GIT binary patch
delta 505
zcmZ3iuw0=&z?+#xgqeYXgW>yDN8io!9GChrGBEJ5GB5};Ffc?r=jWBA=9R>UR2HNb
z$Ldw&=4=hz?RVHfWbVmum$<-3hYkp+zGWzWd7!VS_;S<{{-3)R{;gi;DaH9K!1cCf
z=eeEFrxtCyn0u?yJ=OQ9+$T=&^vg?XZd)CgV$dua-THv5{#@WX2ktX#nVmh7_lh;z
z{heg3Xn$1b$K#YwN0yp0`kZjOlCak!QG46BWfCSvYnN=#m_5OBHTP{3uR|Yh#QrGm
z`8`=&`}!=M>U#dQ93kxCr+WY0)%@?bJ!jUPsq0Nz!cCW)uWjAobhM>-?Iin2dyK!%
zm$-Ss_J3-%Kg%?|7XIS;g`Um(g?nr!P54+}{k8P<!aP3dU27Jeo1?ynpX*z1=N<(S
zOV8(mPsGiqUOOB)ZDNJ$`OB~CBIEgm`yUHDKUM55zDk!daHi@><ras0_fKyw%y`nK
zDCO<+XmRUdO}8J%4BcF~Q~BH_HGg|PGmA0iY0Kqyuk-Hjm7KKX$wlGio=l&3CvW7C
z+B}s}g@+d<1|}clH4$TQy~TJU@(Lp=Jz0QHhp~RLEuRJB#K~oRwv2BlZ{f3J>SLX}
rj8$&35Wg1V+{w25c8psm*YMjj3QgX_Z^!s;@^5}6wvD_D3=9kaP=VK5

delta 502
zcmZ3kuvno!z?+#xgqeYXgW=5_JKyzYYo2&9GB9wnGB5};Ffc?r=jWBA=9R>UR2HNb
z$Ldw&=4=f-?bqxe5_>v)!cD0euMV*`mMzSFduXp@ySMek%9M8^xA%YV3lQ3xD-|8)
zaBk;w*PS<BnY}%*c+;|^@}E6RyDNkCy^~4snsKmet#biu{d0}$3s|3BYgAmaF|ONT
ze%;Zz6XcUv|J0dPKe#lDK`liwYeU?U4WhRzf_bE7hFyx@;(KDr)z(}o)x?T7`hP5s
z{XIE(s<iL4z5Cn4SXMP>KXv>6Zc6>~Xfyx3Q{qyGR!dzvzRxLUVbY<UVJGuF@6G%b
z-jnlU-v14IRhhh}A8JppU+CGqU$Dn!(u9xo)n7|rFU;eM-nC}oxjE{K___W`30Hb*
z6iqs(d_q4@Yi;{x9p#T+^Oj%#yJer8n%r^4b5o!9>4iiyEi%%c<j&D}ukXpt#fB$a
z6s5eK9_?*Ctm#(qNJ4p0t5N&nH=4rg2RCFtJYoNGqoKp3DK=G)Y&`cd1b9yt;gZ-q
zlTn3-7bW;7pW-zUYnUF~_9Ws8V}tF-W-txrPZr_RVXU6)$Y;UWJGqL_mht7}9ej37
vU96K=vC2&r<JV%GIoXllj&bAU27Y@+zR3sp?HE5z{?D((ww9NHfq?-4o{io#

diff --git a/tests/data/no_content_types.docx b/tests/data/no_content_types.docx
new file mode 100644
index 0000000000000000000000000000000000000000..d0e0330fd236d7752b4c3660f70c5c1e33b601ab
GIT binary patch
literal 3651
zcmWIWW@Zs#;Nak3$my~3Wk3Sl3=9nMMX5Q(`g$O8?Id4sW=Ed3_kTs5-`&m8?l^Jo
zo03z$(pUDDnQz`*ILB_Bx%U4(N3LA~T?>xysr>vw=2Gste+&HPbay)*UX|%sXmD|t
zK~}|WTYdS2%lf;*?Iv9Dni-?D>R!vDL?O@qj-x(rR+<<Vu9&Tu<{xz2CP``Y@`;6a
zw|$>cZ?s^(;nizdC!%*VZT;=>$K0$+Y?DfhT3Xz)nDv`2^hEtCQ!XYfu3TC8Gs|Qu
zPep9T-|B@HJm)?Ho4B8MTvz!q(jsEp_Nf|6qut8?XPV^KoZfRh=VIKW?z4*y-<19`
zKfs%v<HZF=<(mu)47N-R47fu?fPsM_B|kZ!D8HasKe3=duOc^R?nGa2W<!Ct_y4+7
zzufYTO4Xd?`;pn7m3!T_=+2WR=9_*h%$)Y^zI1e?c>le7>GvMaJ=vn)zfGS{VDiUx
zYMUobWSaBo){bpz>(5>;aqMcjlzFq7-PF=i$-D6A^^5Es849A72baz&>3BHp(h{o;
zMpJK=-MslTLp7~O_v`MtfgfI6z1QSymA+<CflGnZ7ZaYwCzC3w5BzV-TEFkRcm7$;
zCpHJAH!Pga(R4ZfX2q1+WF3xar<p(O7N}aR@~pdK>d%f3HFA!tCru5toWG{$w*2<@
z-C5ChMHaW^)*VvUSke<|w)6h%IREE!`X2v?+BV@x&*kn1((+$t>NnY!7<sk+v_E3H
zTv^Iey7{j61g?KydR|rfP05|I@VBH{K-<F$kGMf$YZG_GPmYm+;U6QuuoXlO+vNPB
zR8Zjd2F3axG7zc#9<Eb+f0v=*jSUl5JoFS2R*<@%RV{gSYxb=}5ANU2%~|m2!EM{`
zbLRcN`tps!=>+~`U0$ggok=1MNuuJbPF$!m<yY6((5K~h+RL%6FX5uj{+UL;5vdQ3
zhDh}YCotzP(Mhy&j(%=(@Wv)xk1XZ&0rT#JxCK35sJUU;n{KfN_rjWXRf|(A7w<6S
z`TC&4Z`vH*X;tZ`t~D6*sCopPd!*v^A@cPBkLs_Er(-mh7P#|<FSWH?!5_wRXxF;M
zR=G>}{b$*)J8$~i6$=B-u98?#Q<c?Xz<h;~e-Go!Q(w<y|N60I*5Vow^~nbN_%ByX
zy)*gCw4Te~HUE9xXZYsPsVBVEY<ABQW(wZ=DP8uYzvZ}U=+SffHteG7rms!$FkP;*
zvXs02OE1H}f-iF6MYZmqru_czk!&Ot-*ZKJ%T(SsuAjeuTB2)pU|II=-#=KM?V8B;
zAXz<zGj<pM|3An7NP{A1+iIcKBt`~?+f4W(NQQxdp*+7RMITZoq~s@;=BDPAfT9Uf
zK1}rGI%FWw`hA|#nS0AZ1tu=bb1BgN%3wHm+d(tVlNwdk%w16$j}ATkQ~TafKR*6<
zU+e|;+};Kq&&Gv;+b%5=EYR3~S#?df=f2~9K{1?b%NmbL3$tu-$vavZp!>~8<=Vo|
z$+v#R1?kito*SZmQ(^wnmrcL@*+iJtZih1cS$C>os(JhV4Y{$|t<tN{*-A-Ge9)LD
z+vYK4pQF%+6_Zae+UeQ<US5;SJN46zhqn)|{(APvpY`?*i)K81E<SJW@>b7t><`7t
zLBa7;-{8Pe1_lOUyukrVui)T74v3{8zWKKec=o;zm)NDg?fRk<Jyv^Jq{|`%6>jne
zq}&u!>tu30ezxxKk{_$H8x<q;m#uW)=YDCC{QS5czjH!+nW8t<%-!e`#W;17@5ws3
z{p-whJ|4+EY8nx_sr$>D%-=6J>T3zioj7gMfrMH{)z=rq;<j8W*l*;eP_ZIY^?*pg
zb*~c_HyAQ|UlZD?<a>OHsYNDp7_*e$!J}%jL1l}>ep#ND3z{-3c*XVYr+#+6{iFYP
z^Tptp`tB_L2ul{lz5LAvRToTSZq8IYpO<0r>5Sw%v8P;>|GhrFcPeAvVeqSbO0neY
zfMXVWwY?vl-fUjeKFxaP22=Uu-)$FmcAo8s-eoFNU#jPKrd7WxDC*+VTR(zA>i%zB
zb?@Ap|34o6HC*)LUW7y4r9?rdK%v#iw>NOBuh6@=>veK}c=3YnX`b<OcIw?MeiOCX
zL`j1A($PN7AAGO5Z%N&Bo8NX>(M~CjODb6}Z`#IBt>5-cY@U$Kx1U*PUCq4uPUpAX
zW_#>&-kAj-;O~1Fa(Y(Mg}b#qj#sy=zZ3S!t1b2HBIgHUOV}1jS~3dGV9l|TIvDtB
zr}6QHS8lQV^5Qb>SBdMgW%8}P67zc_CztM9k+!nKr~G#ud#rirrBO%8FP@j5@^AfP
z1f`4GX7?{Q85tNJu;9xkLa=O-mY-J=l9-eOF38pfIQktn5ZLpX>vZ|6E$l6emWc{I
zYN=ZAlrcK}a>S<V6R)d$`M$$XTk*ug1^3>}y*cw{<U0GgqPA>i^Rz7Iiv}rV#!Fj;
ze5(Ck^k0M3eSy>(^)*ifBiDtOpI(?_<0_UpcZZDD&0{yFdTq?R$aub{eaePo?CgfM
zhaGwCuEc(jnmKvb=Bz*w{!@*LmD4z{pZKxNP-_0(e+3?eBAgkD%eZ%#%AUSzIak7q
zX}Z(<9%nIWnIp5y9cC_w=IBxP<KH&PRWbA)bNZW)M!YlUHJ)?18~4nXg}J(0-J>^c
zzt48}?cd8TWfx|zU+yw<+bo-I_QQYtSBGytKH;=jG|#e=>k=NTEfk+SzjpP?l+V%A
zKiEnN__In^@3K2Lzst(L;wvjCa1S28$sx<gz!1ueFL2?Rt~j-%Br`9)7*zVW_IV2(
zGT?FkK1b=Xt;&W6mlfakaWk-UtZh7}Ql@4mVqu$jH0aV~x8LukE~-qLIPv7+qK5|*
zzS{JZd@it)eHLx7sIg68=7q!3th>s#&6je#>eacDTjlJNhwElZXD{8K%(dx3PsnEN
z=^|TB6>S!tt|<O4a7kG37VYQP%vvWdSUXw%S;{iC<d+k#96s0h_a}4QVdYnvF@2|Y
z3Y{0Q&AIbV{PzE;-@p}N7fWEsQU(Tw^9=a>$PY_E#U+(FsbC+T4bAnxZ6NS(U%cSY
zYEQ?Gg;|Un6}IH9n62@rVRf;mj?C@PmQy~xPT$R`T`Rc$(zm6Z^FPPVtDblH>*<%Z
z%9E|PsBAfP<KMjrsw;NrwX8BKE3P%PNaM5&VD(sQkj$-8R`u!S=^B?j#fd`m*JqfV
zk9a#ra;fG=Dd`2axdA^TR^~FFlh2)cZmVw3RhiD3pKi<84?hgN)ex{oe{=ngz!idb
zvMw#X{dkvWX(U(Q#wq^y<&SN5`^GBuPe_UP#_el7HMY!tXIez^1W#^Q@^EqC(k-ns
zzuc{|dvoM_R7PE&>Xu7^7PsQJS(h#UvE#z2KqGm^LPeu8&9sc#%C(a;qXLcOSDhDm
z%b_^ITA5LDcE<9_ni^i8-rG;goEMXL)oa5;G4=UtvmZ_>-owl)8gcN_ho8|~PQ~wF
zp1HAE)rG(RKw|%sEq$H=q5TV`?)c2#RJnrvr;3AdV8yGg5<TjQhHs_(uRk?-z5K(B
zZm09b`V+1e&CJ>L<m&5}ZI=p9{5gL2P=z&5#0s4mRv$VS77Bj(bneuP$xUp(<E$*c
z{rIi4Yrft27xfz_XRpk?&$Y$xfARFi_w>I$k#>qulwS6w;)x0Kp-0ParOh~0ef{zE
ze}7l&iCM4R<7zT_Doc9wn<)kdTytvL>KGF)BujKSy}a<Nw%j0Pz3F}HorizxeTsV>
z&SQHo>BYzDn#wChcV{rDDgIhxIyb0(=~SnOq0-k<=UnFd=wmOxQS|Bj7j-*)zi&4y
z)jW}FB62WyuZ`^Tsis@b{*Tq$J2i_>?aV2*+j8}Oe{?VVuQj~(e`#FRMfKdr$0wGZ
zshjaF{LH_1oT8`i&xza9aBtQ12+@lD3$F?NHCStR=-<Cyzkdv%?3v**i>r=_fgzX^
zlyn)HL>Lf#2IRIgsLz1Zg+T2=1bCxrL~a;@n$`%7LX4OVZFJqptrk$ZfY6=CgrpnR
zltkB#T+4%+HVEzfEchEb=q4f83#bl}V}+W8+(tmxj$HDADi4H1%-EsYQK}Mj{m7*X
vs`ZmOp!$(ZOLXnXF$*es5Z0G-!nDH+q5yAJHjn}i1`dYbj0_A$JRlwbz+LW0

literal 0
HcmV?d00001

diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py
index 4ac2678..8d7c252 100644
--- a/tests/test_corrupted_files.py
+++ b/tests/test_corrupted_files.py
@@ -86,14 +86,26 @@ class TestExplicitelyUnsupportedFiles(unittest.TestCase):
         os.remove('./tests/data/clean.py')
 
 
-class TestCorruptedContentTypesOffice(unittest.TestCase):
-    def test_office(self):
+class TestWrongContentTypesFileOffice(unittest.TestCase):
+    def test_office_incomplete(self):
         shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx')
         p = office.MSOfficeParser('./tests/data/clean.docx')
         self.assertIsNotNone(p)
         self.assertFalse(p.remove_all())
         os.remove('./tests/data/clean.docx')
 
+    def test_office_broken(self):
+        shutil.copy('./tests/data/broken_xml_content_types.docx', './tests/data/clean.docx')
+        with self.assertRaises(ValueError):
+            office.MSOfficeParser('./tests/data/clean.docx')
+        os.remove('./tests/data/clean.docx')
+
+    def test_office_absent(self):
+        shutil.copy('./tests/data/no_content_types.docx', './tests/data/clean.docx')
+        with self.assertRaises(ValueError):
+            office.MSOfficeParser('./tests/data/clean.docx')
+        os.remove('./tests/data/clean.docx')
+
 class TestCorruptedFiles(unittest.TestCase):
     def test_pdf(self):
         shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
-- 
GitLab