From c67bbafb2c60782096af4f6225d94e18225d2ecf Mon Sep 17 00:00:00 2001 From: jvoisin <julien.voisin@dustri.org> Date: Mon, 1 Oct 2018 22:26:35 +0200 Subject: [PATCH] Use [Content_Types].xml to improve MS Office coverage --- libmat2/archive.py | 4 +- libmat2/office.py | 98 +++++++++++++++++------ tests/data/broken_xml_content_types.docx | Bin 0 -> 4145 bytes tests/data/malformed_content_types.docx | Bin 4131 -> 4135 bytes tests/data/no_content_types.docx | Bin 0 -> 3651 bytes tests/test_corrupted_files.py | 16 +++- 6 files changed, 90 insertions(+), 28 deletions(-) create mode 100644 tests/data/broken_xml_content_types.docx create mode 100644 tests/data/no_content_types.docx diff --git a/libmat2/archive.py b/libmat2/archive.py index d812531..b29d690 100644 --- a/libmat2/archive.py +++ b/libmat2/archive.py @@ -17,7 +17,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): """ Office files (.docx, .odt, …) are zipped files. """ # Those are the files that have a format that _isn't_ # supported by MAT2, but that we want to keep anyway. - files_to_keep = set() # type: Set[str] + files_to_keep = set() # type: Set[Pattern] # Those are the files that we _do not_ want to keep, # no matter if they are supported or not. 
@@ -89,7 +89,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): abort = True continue - if item.filename in self.files_to_keep: + if any(map(lambda r: r.search(item.filename), self.files_to_keep)): # those files aren't supported, but we want to add them anyway pass elif any(map(lambda r: r.search(item.filename), self.files_to_omit)): diff --git a/libmat2/office.py b/libmat2/office.py index 91bf2a6..3abf108 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -50,25 +50,75 @@ class MSOfficeParser(ArchiveBasedAbstractParser): 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'application/vnd.openxmlformats-officedocument.presentationml.presentation' } - files_to_keep = { - '[Content_Types].xml', - '_rels/.rels', - 'word/_rels/document.xml.rels', - 'word/document.xml', - 'word/fontTable.xml', - 'word/settings.xml', - 'word/styles.xml', - 'docProps/app.xml', - 'docProps/core.xml', + content_types_to_keep = { + 'application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml', # /word/endnotes.xml + 'application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml', # /word/footnotes.xml + 'application/vnd.openxmlformats-officedocument.extended-properties+xml', # /docProps/app.xml + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml', # /word/document.xml + 'application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml', # /word/fontTable.xml + 'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml', # /word/footer.xml + 'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml', # /word/header.xml + 'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml', # /word/styles.xml + 'application/vnd.openxmlformats-package.core-properties+xml', # /docProps/core.xml + + # Do we want to keep the following ones? 
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml', + + # See https://0xacab.org/jvoisin/mat2/issues/71 + 'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml + } + files_to_keep = set(map(re.compile, { # type: ignore + r'^\[Content_Types\]\.xml$', + r'^_rels/\.rels$', + r'^word/_rels/document\.xml\.rels$', + r'^word/_rels/footer[0-9]*\.xml\.rels$', + r'^word/_rels/header[0-9]*\.xml\.rels$', # https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx - 'word/stylesWithEffects.xml', - } + r'^word/stylesWithEffects\.xml$', + })) files_to_omit = set(map(re.compile, { # type: ignore - 'word/webSettings.xml', - 'word/theme', + r'^customXml/', + r'webSettings\.xml$', + r'^docProps/custom\.xml$', + r'^word/printerSettings/', + r'^word/theme', + + # we have a whitelist in self.files_to_keep, + # so we can trash everything else + r'^word/_rels/', })) + def __init__(self, filename): + super().__init__(filename) + if self.__fill_files_to_keep_via_content_types() is False: + raise ValueError + + def __fill_files_to_keep_via_content_types(self) -> bool: + """ There is a super-handy `[Content_Types].xml` file + in MS Office archives, describing what each other file contains. + The self.content_types_to_keep member contains a type whitelist, + so we're using it to fill the self.files_to_keep one. 
+ """ + with zipfile.ZipFile(self.filename) as zin: + if '[Content_Types].xml' not in zin.namelist(): + return False + xml_data = zin.read('[Content_Types].xml') + + self.content_types = dict() # type: Dict[str, str] + try: + tree = ET.fromstring(xml_data) + except ET.ParseError: + return False + for c in tree: + if 'PartName' not in c.attrib or 'ContentType' not in c.attrib: + continue + elif c.attrib['ContentType'] in self.content_types_to_keep: + fname = c.attrib['PartName'][1:] # remove leading `/` + re_fname = re.compile('^' + re.escape(fname) + '$') + self.files_to_keep.add(re_fname) # type: ignore + return True + @staticmethod def __remove_rsid(full_path: str) -> bool: """ The method will remove "revision session ID". We're '}rsid' @@ -270,18 +320,18 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): 'application/vnd.oasis.opendocument.formula', 'application/vnd.oasis.opendocument.image', } - files_to_keep = { - 'META-INF/manifest.xml', - 'content.xml', - 'manifest.rdf', - 'mimetype', - 'settings.xml', - 'styles.xml', - } + files_to_keep = set(map(re.compile, { # type: ignore + r'^META-INF/manifest\.xml$', + r'^content\.xml$', + r'^manifest\.rdf$', + r'^mimetype$', + r'^settings\.xml$', + r'^styles\.xml$', + })) files_to_omit = set(map(re.compile, { # type: ignore r'^meta\.xml$', - '^Configurations2/', - '^Thumbnails/', + r'^Configurations2/', + r'^Thumbnails/', })) @staticmethod diff --git a/tests/data/broken_xml_content_types.docx b/tests/data/broken_xml_content_types.docx new file mode 100644 index 0000000000000000000000000000000000000000..41e0e49e9fbb843ce24f4b4538adf6e8991b3e3c GIT binary patch literal 4145 zcmWIWW@Zs#W?<l8Shv;D_qubrUIZfpg9<AHgD?XFL$q^#UP)?RNqk6UL27ZVUPW%s z*1+9<hYdvLo(y-13w(6wfPm^-2IG<h_hb7qb^lCy+xG4KPPI&d3t^%r-jAN#sg7GG z9=-MxU(JMTCQp{lnOI|Z{?+rj%@dT<oSd&2?&`X9;QE3{jkV0p9@%@-1nvKJSslB7 zgX_iHoHF;Frh+~?Nskn4JyWGOeOxAG!KJ%%`@wA+AB2T(%T;ZExb^;r=N#X)`KRTZ zczydNC(<pUzGkWP*WJOt=7y!ri@oX}(K4Os({DSctc8kaELDH1tG?~|k+@i5)Aai( 
z>kAv6ct3lreEEc9ya3-8AAhI+Ypxy_b-p9CZHK0V$hpT0`i1JaAAb<K=rD1YkHX*b zpgh|-sx#U2msIY$TmJfJz{d!_b1^&KEos_l*`jck>m~D%j^)aA-!h7f7It1!608m_ zxN(%jKPTD6Wb(5OQ~#bg;^%Zn)1pY-_FHPh%r(=NC3|(PIkw*<dnr@PrdevnjQ;ZA zh+yF0;9$t<vGZj>0^AG?4Dm&&ImP;VAad;_UvFkdp0@XYMV;T>&C%{SaqgRvQ@+wy z_LiA%-d#AyZk)OH|2;>pT>)JSj_;}b{6XeY?zw*p{N{9bJ04z@=~!rRahE|>#cf-C z`Gm{*yTa`zT=AM2qqXW@%c4Xf&;E|1K5tf<7#6OWt(fK?blfIMY4h@lg?G1opHXkL zV7}qiYgs3vcQbAM?eWLltV(Q?N{d=r+_ISUn=SN2{VG!~CM>R8S@<){WGYWZY{uW} zg%&*LJ_MV%pLbkW`7zQWV%zqq8cU<y%KvAY<ky_ub3Er_+@tQZiw@tE{xUzno1Npu z1xDqY3=9mmObiUTLqvdqfgvS7IiM)Npjbb#pa7IE=T7wXW;PUPd;hOX^~){as8r2K zz8{(WS-IC;i|#yGV!r98!pv#k?n_5Uiud2UmwxZz+><T({oC~U1SWr6r?z?0M5Z~P zZtd8nw*Ktp634EVOPM#T*-b4SmAng&UcboRk)a@Jd2s2hl8%SdE-kUzU^Mk++0C0j zGgQ-hbieMN8~EYH)q73OR_SXN6}S{geKFx_d@`w``oRCTto8f8d*`3kd}4D@dc(r$ z98H(wZ&pmHP1fO<cAELaZh@-BD$lw*rvB{sP$TEKdeYQT%lT`1Zp&|f-<=hGS7dQp zZrveujU_#iW;^fCj`M#$r|<EPsBIID^jz+KAT9rOrhb!siIG?9Px~XL%ax@prJL`1 zPvH9ZrRP<p-;~@b3x7+R1++c9@Q51}wl;A`{NxxJ82&Nh3tK_tuuaY{N(BXOZ&0lN zAp?=x@8LSN_jef@-q<j4#Y0aaVFjuCS=EwPw`Siu^x*#O+?)lU9^AJ5K4;$Vt1sUu zoKE0B*5#F|(U~ODkR&R;>coXAQ+{=g4SiaEr@b88`Vubc?4N1m8<G0pXoysgZ~}Aw z5}iaV=ji7q2XAcB^~h3QA29Duh+EL}g_;|dz3CQfa4)QBSG72`a`6sBp05u&{HD#} zomQ28>RN*_kE%z&xkoBqA0l5L@TmUkcsfR7X@NU$_)=TT75rg5hjy)7Y?Zro-+z|< zy7Q*LU9m9W>?(-`HC0(H2FzC&`S&osJoWWV_OBmXW-YD}QJ-wEkN<MT)H{>EOzXM) zUGv}9eTHuyoqEDs&1Uy3VW!}%pVDPd`df~xh8{hqZ^JISZu;6357Xs3D@(cSzw|Qv zEBGQ8UR3M;Y0B^a9?3>h@jX|hw@l@I<NEpgrzN^p2bN{;{{4gH*{+Ff50cemIAeG5 z|NnFRk2ENPwyhRwO=4tVxXpwwf@ByN7|QdDQuHBZLP~yeX>Mv>2`HK%<-<f@u0sX_ zt>5P<ow>IxRAAz=JeLCPuMCEBw;eR&JgHGt&D<5G@#xUgKeg`-_2c7z_r+dd&+To{ z@oZcexb4z1!2*r#msQtvd+s~#7Zk&}wyg1}v@pvSm%O8u0lMFeRIV-ToP6t7T#!!f z;khB|Hx=eDecANepG|~W?RF^BpLM4irkc0!-;f)d-73BMoUN49#0QOevTYty_BjfD zSTXqoqn)1p@8vbQyi-5jczFBZ>aS;y{8?}BuxQ59=i>9`E^qZb$No^P926Ws^$iXj zWnf?s#v2@<^a>6R<bYTj;+uclfM@UfaEV>&+paG<(POojMY=3PP~j$jK*~)qwN572 z<7eysF8Q%KyHPPhf7wd+eeRbQ$<L4L@jEB9mnnKv&D@PHQH)bJ`JSwk+rQ3C=i`yw 
zqoxs&o4UWe$^89tqrR5F+=<gB9Z0BURDFFxEN;uCg8fEb3Kc6dRS$>+T=zP0af2bV z_cfuNO1{UJm|A2qhcQd}9XzTg8&tM9?3d+fxu7Ytf>&JMe(GoE+duk$H(v~nsqfC> zkFaD>+{@o=P<6pH=H^Vb^LZH-pUz0W6MM>4`QPi)d#5ty9R|P3rxZ)R4mf79SKIr+ z>CNUf?bEDxZZMTk{@r$AXXn|D=v}5V^`&}#XIk~Ef}$=yz4ap~r0)O5Rrk)l`Tyh5 zU&BQ|?nOA%T}l*W3KUwMe0u}8`U<^^yIv>vhZisCp5_@pXQ$rH;x|#7O_U^<FCFdU z{K5B{`<B#AxA|?C744MbxTKQx@}_P4)cS4D#O4XveEXS&*450b?{t3KZMMfg=bc&b z0sg*+A*W|0UASA@<9Kz;`a5BtyxLOFE^>Y#wuEhgq$Q)^4AvYgse^&9b{ZdFc;yz$ zFE1|BewDZ`TPEMyD>1(}a&qau6=^Fwe9C{vvB#Q+UK(|j{Nj1}DgV|#Mo_w_ZFc{1 zlaYbp0SmrtA_U7OY5932A&E&j;DT&zfTQ1G1A#rCxlWhA+QQzlXql+cqn4@#PZ^`r zFGp;;KJmKBm+w3Lv=vV*TyXEr+?z9RMy|7;D{9MTHc!iPzG#p_X1ugj$fw%xMgKKe z-4{r$QD5^!FmhdZ`RRo@Hm+isb9cyS-8^<<s@KN6i;U-M+NW$d#?Ee7d)SfJ?n>+j zshN{^ZO#f5;Xl==SUHXJ`iURQ45jAp{a4^oD8iYcxQu&;sqE>imUAV%n5H|u?{OBB zmN_!J++pT|XpSCrKmKi#Tops_F{i)zXv8~nUgJ5JyK&EKS(vN4)jfLC_WNvi-~PSq zQg&hX`sFS&x6QKYW<UJLe|7lg;}cGcMe{5>xh~<c+CuTU^J`bHO!*u={e!KffIq8r z^)9<}^Si9<E55RV0{7tYn;f!?3=E;n_yQN6>55ZJN;31(i$SH2YoE8!Ap;)g?{kzM z+p27Ma9QzfA2$O#$J)koDrIVBA{MrZM}sa+cKiKq>Y~b|i4#vAE_!%C;j2wg$>#z) z*=NxPiyGSmW?ncf&AO{>+k7d<t6rTexmC_SdAM$tboSEy$y}Qr^n`5Io-VTGRMBSP z>5Ag-0+)mZZ_$2!&8&6eg0++7pQS8QOMW@=%Hea3e}6K^9aes&8Pj)ar_gx;+nhV^ z#Bcwf`VCwWcCiG8EM;I|IM0C3kNmI%R9sS-lM43X+0b17+Xe#v_QebStoC&5SeV7Q zQDIBoirE@}8dev3>d4&wY&qrA>-626+O>k~FMV6uIsbF)yy|(Ezn*?st326yi^`T$ zH~!t5pt@p*Udt+@vf^4pi!@Hl09KE+2FctiWmTVEp007pQ=BL?e|?6@`G~i3B$sM_ zl#*Uxn;Y;mVr4G#Ir-eF=eFwhT$Smp`RTTd{qV!UTMYqg^f%Y<2wWj}C+pJE+mCm7 zmPT^*ZJgqNU;fy3w{NUc|AdryZ`{7dQ)A2Qccw)oPw?c1B@Y)DF5S{P^UK{TyEjL^ zM`hIYscyLxXmKlkn|0apA3H9b3N(^uEL1cq(@e{#tz0`vGb+$Xe${!Aw;YNStd$uh zXJ;&*tf}Gk>An4=%y}`1SG_ho6jPtSHv8eE;yui)q7er#efSx@<y8C*=9wFtRbBY| z4<z<K+0y435Zb>`>W<I+O_eLyf2ueb2Ufh=D$%2^X!usj|N2vd*ULZ5=yp0^tUuvu z(afA(Pp-ax*><V$#Gm7L4^>$6M6A%6VfCSNVWHrcPv=g(nB2tnJI>1D+mGK$yXM=S ze^I}2a`wvH`&?W6{ufVQd{6)D6KSUiMd@W<DxR1yA9}R>R@#h1)z=?i|Mz#bo|yI8 zJ+3B`r?R9+znNliz%{3)t&TC_Lb61M)5{CLYRe5$)|=k9-g)@9-lw?N;XJnYl3skQ 
zuBp6Ibaw`Wn&PiDrgMYpmriwh7%F`&b<SnJk3RPD8%3Yae^Ix?_xpCUQq2>&CL#xO z_u9xVpK7}0?EhH3y;HOJ)Xtn@yDeAm_eb}#|60Rq|Ch#9T~yC~e0*ZrnYtO@!q5DB z$0>UH{+zfy4fj@Ej}WcczwnySUxT%FhyMNR_4~&F%AOf6v$*P*7#M;%K}na9$(|Wk z=S>CLgHUML(g<Q=>B@02D1c-b7#SoO8hRPlACJ7k$Z%V70f>h21H2iTL>NTi9TVgR zI;dj;HULa?GNN=-&^02rK0)nwupTfmn+dZukFFcJX#^^Kz#75CcV;Bru=XmtcI2i6 zsQm=i2_|N+;%`l%n}l5XpgLqZ8`LD^h6uWL<l+@nLxJ4@CiZhcwWHKm==zb1E>!Df txS;xxi)3`|$jJv(zJYCq5Zo~B@X{{8o0SbD$H~CSkif*iu$vde0{~d}!bSi9 literal 0 HcmV?d00001 diff --git a/tests/data/malformed_content_types.docx b/tests/data/malformed_content_types.docx index 43ac7437618f8f49e52c2006526efa087cb0c011..cc5caf3515b228391273bd7f0ec615ab0bad915a 100644 GIT binary patch delta 505 zcmZ3iuw0=&z?+#xgqeYXgW>yDN8io!9GChrGBEJ5GB5};Ffc?r=jWBA=9R>UR2HNb z$Ldw&=4=hz?RVHfWbVmum$<-3hYkp+zGWzWd7!VS_;S<{{-3)R{;gi;DaH9K!1cCf z=eeEFrxtCyn0u?yJ=OQ9+$T=&^vg?XZd)CgV$dua-THv5{#@WX2ktX#nVmh7_lh;z z{heg3Xn$1b$K#YwN0yp0`kZjOlCak!QG46BWfCSvYnN=#m_5OBHTP{3uR|Yh#QrGm z`8`=&`}!=M>U#dQ93kxCr+WY0)%@?bJ!jUPsq0Nz!cCW)uWjAobhM>-?Iin2dyK!% zm$-Ss_J3-%Kg%?|7XIS;g`Um(g?nr!P54+}{k8P<!aP3dU27Jeo1?ynpX*z1=N<(S zOV8(mPsGiqUOOB)ZDNJ$`OB~CBIEgm`yUHDKUM55zDk!daHi@><ras0_fKyw%y`nK zDCO<+XmRUdO}8J%4BcF~Q~BH_HGg|PGmA0iY0Kqyuk-Hjm7KKX$wlGio=l&3CvW7C z+B}s}g@+d<1|}clH4$TQy~TJU@(Lp=Jz0QHhp~RLEuRJB#K~oRwv2BlZ{f3J>SLX} rj8$&35Wg1V+{w25c8psm*YMjj3QgX_Z^!s;@^5}6wvD_D3=9kaP=VK5 delta 502 zcmZ3kuvno!z?+#xgqeYXgW=5_JKyzYYo2&9GB9wnGB5};Ffc?r=jWBA=9R>UR2HNb z$Ldw&=4=f-?bqxe5_>v)!cD0euMV*`mMzSFduXp@ySMek%9M8^xA%YV3lQ3xD-|8) zaBk;w*PS<BnY}%*c+;|^@}E6RyDNkCy^~4snsKmet#biu{d0}$3s|3BYgAmaF|ONT ze%;Zz6XcUv|J0dPKe#lDK`liwYeU?U4WhRzf_bE7hFyx@;(KDr)z(}o)x?T7`hP5s z{XIE(s<iL4z5Cn4SXMP>KXv>6Zc6>~Xfyx3Q{qyGR!dzvzRxLUVbY<UVJGuF@6G%b z-jnlU-v14IRhhh}A8JppU+CGqU$Dn!(u9xo)n7|rFU;eM-nC}oxjE{K___W`30Hb* z6iqs(d_q4@Yi;{x9p#T+^Oj%#yJer8n%r^4b5o!9>4iiyEi%%c<j&D}ukXpt#fB$a z6s5eK9_?*Ctm#(qNJ4p0t5N&nH=4rg2RCFtJYoNGqoKp3DK=G)Y&`cd1b9yt;gZ-q zlTn3-7bW;7pW-zUYnUF~_9Ws8V}tF-W-txrPZr_RVXU6)$Y;UWJGqL_mht7}9ej37 
vU96K=vC2&r<JV%GIoXllj&bAU27Y@+zR3sp?HE5z{?D((ww9NHfq?-4o{io# diff --git a/tests/data/no_content_types.docx b/tests/data/no_content_types.docx new file mode 100644 index 0000000000000000000000000000000000000000..d0e0330fd236d7752b4c3660f70c5c1e33b601ab GIT binary patch literal 3651 zcmWIWW@Zs#;Nak3$my~3Wk3Sl3=9nMMX5Q(`g$O8?Id4sW=Ed3_kTs5-`&m8?l^Jo zo03z$(pUDDnQz`*ILB_Bx%U4(N3LA~T?>xysr>vw=2Gste+&HPbay)*UX|%sXmD|t zK~}|WTYdS2%lf;*?Iv9Dni-?D>R!vDL?O@qj-x(rR+<<Vu9&Tu<{xz2CP``Y@`;6a zw|$>cZ?s^(;nizdC!%*VZT;=>$K0$+Y?DfhT3Xz)nDv`2^hEtCQ!XYfu3TC8Gs|Qu zPep9T-|B@HJm)?Ho4B8MTvz!q(jsEp_Nf|6qut8?XPV^KoZfRh=VIKW?z4*y-<19` zKfs%v<HZF=<(mu)47N-R47fu?fPsM_B|kZ!D8HasKe3=duOc^R?nGa2W<!Ct_y4+7 zzufYTO4Xd?`;pn7m3!T_=+2WR=9_*h%$)Y^zI1e?c>le7>GvMaJ=vn)zfGS{VDiUx zYMUobWSaBo){bpz>(5>;aqMcjlzFq7-PF=i$-D6A^^5Es849A72baz&>3BHp(h{o; zMpJK=-MslTLp7~O_v`MtfgfI6z1QSymA+<CflGnZ7ZaYwCzC3w5BzV-TEFkRcm7$; zCpHJAH!Pga(R4ZfX2q1+WF3xar<p(O7N}aR@~pdK>d%f3HFA!tCru5toWG{$w*2<@ z-C5ChMHaW^)*VvUSke<|w)6h%IREE!`X2v?+BV@x&*kn1((+$t>NnY!7<sk+v_E3H zTv^Iey7{j61g?KydR|rfP05|I@VBH{K-<F$kGMf$YZG_GPmYm+;U6QuuoXlO+vNPB zR8Zjd2F3axG7zc#9<Eb+f0v=*jSUl5JoFS2R*<@%RV{gSYxb=}5ANU2%~|m2!EM{` zbLRcN`tps!=>+~`U0$ggok=1MNuuJbPF$!m<yY6((5K~h+RL%6FX5uj{+UL;5vdQ3 zhDh}YCotzP(Mhy&j(%=(@Wv)xk1XZ&0rT#JxCK35sJUU;n{KfN_rjWXRf|(A7w<6S z`TC&4Z`vH*X;tZ`t~D6*sCopPd!*v^A@cPBkLs_Er(-mh7P#|<FSWH?!5_wRXxF;M zR=G>}{b$*)J8$~i6$=B-u98?#Q<c?Xz<h;~e-Go!Q(w<y|N60I*5Vow^~nbN_%ByX zy)*gCw4Te~HUE9xXZYsPsVBVEY<ABQW(wZ=DP8uYzvZ}U=+SffHteG7rms!$FkP;* zvXs02OE1H}f-iF6MYZmqru_czk!&Ot-*ZKJ%T(SsuAjeuTB2)pU|II=-#=KM?V8B; zAXz<zGj<pM|3An7NP{A1+iIcKBt`~?+f4W(NQQxdp*+7RMITZoq~s@;=BDPAfT9Uf zK1}rGI%FWw`hA|#nS0AZ1tu=bb1BgN%3wHm+d(tVlNwdk%w16$j}ATkQ~TafKR*6< zU+e|;+};Kq&&Gv;+b%5=EYR3~S#?df=f2~9K{1?b%NmbL3$tu-$vavZp!>~8<=Vo| z$+v#R1?kito*SZmQ(^wnmrcL@*+iJtZih1cS$C>os(JhV4Y{$|t<tN{*-A-Ge9)LD z+vYK4pQF%+6_Zae+UeQ<US5;SJN46zhqn)|{(APvpY`?*i)K81E<SJW@>b7t><`7t zLBa7;-{8Pe1_lOUyukrVui)T74v3{8zWKKec=o;zm)NDg?fRk<Jyv^Jq{|`%6>jne 
zq}&u!>tu30ezxxKk{_$H8x<q;m#uW)=YDCC{QS5czjH!+nW8t<%-!e`#W;17@5ws3 z{p-whJ|4+EY8nx_sr$>D%-=6J>T3zioj7gMfrMH{)z=rq;<j8W*l*;eP_ZIY^?*pg zb*~c_HyAQ|UlZD?<a>OHsYNDp7_*e$!J}%jL1l}>ep#ND3z{-3c*XVYr+#+6{iFYP z^Tptp`tB_L2ul{lz5LAvRToTSZq8IYpO<0r>5Sw%v8P;>|GhrFcPeAvVeqSbO0neY zfMXVWwY?vl-fUjeKFxaP22=Uu-)$FmcAo8s-eoFNU#jPKrd7WxDC*+VTR(zA>i%zB zb?@Ap|34o6HC*)LUW7y4r9?rdK%v#iw>NOBuh6@=>veK}c=3YnX`b<OcIw?MeiOCX zL`j1A($PN7AAGO5Z%N&Bo8NX>(M~CjODb6}Z`#IBt>5-cY@U$Kx1U*PUCq4uPUpAX zW_#>&-kAj-;O~1Fa(Y(Mg}b#qj#sy=zZ3S!t1b2HBIgHUOV}1jS~3dGV9l|TIvDtB zr}6QHS8lQV^5Qb>SBdMgW%8}P67zc_CztM9k+!nKr~G#ud#rirrBO%8FP@j5@^AfP z1f`4GX7?{Q85tNJu;9xkLa=O-mY-J=l9-eOF38pfIQktn5ZLpX>vZ|6E$l6emWc{I zYN=ZAlrcK}a>S<V6R)d$`M$$XTk*ug1^3>}y*cw{<U0GgqPA>i^Rz7Iiv}rV#!Fj; ze5(Ck^k0M3eSy>(^)*ifBiDtOpI(?_<0_UpcZZDD&0{yFdTq?R$aub{eaePo?CgfM zhaGwCuEc(jnmKvb=Bz*w{!@*LmD4z{pZKxNP-_0(e+3?eBAgkD%eZ%#%AUSzIak7q zX}Z(<9%nIWnIp5y9cC_w=IBxP<KH&PRWbA)bNZW)M!YlUHJ)?18~4nXg}J(0-J>^c zzt48}?cd8TWfx|zU+yw<+bo-I_QQYtSBGytKH;=jG|#e=>k=NTEfk+SzjpP?l+V%A zKiEnN__In^@3K2Lzst(L;wvjCa1S28$sx<gz!1ueFL2?Rt~j-%Br`9)7*zVW_IV2( zGT?FkK1b=Xt;&W6mlfakaWk-UtZh7}Ql@4mVqu$jH0aV~x8LukE~-qLIPv7+qK5|* zzS{JZd@it)eHLx7sIg68=7q!3th>s#&6je#>eacDTjlJNhwElZXD{8K%(dx3PsnEN z=^|TB6>S!tt|<O4a7kG37VYQP%vvWdSUXw%S;{iC<d+k#96s0h_a}4QVdYnvF@2|Y z3Y{0Q&AIbV{PzE;-@p}N7fWEsQU(Tw^9=a>$PY_E#U+(FsbC+T4bAnxZ6NS(U%cSY zYEQ?Gg;|Un6}IH9n62@rVRf;mj?C@PmQy~xPT$R`T`Rc$(zm6Z^FPPVtDblH>*<%Z z%9E|PsBAfP<KMjrsw;NrwX8BKE3P%PNaM5&VD(sQkj$-8R`u!S=^B?j#fd`m*JqfV zk9a#ra;fG=Dd`2axdA^TR^~FFlh2)cZmVw3RhiD3pKi<84?hgN)ex{oe{=ngz!idb zvMw#X{dkvWX(U(Q#wq^y<&SN5`^GBuPe_UP#_el7HMY!tXIez^1W#^Q@^EqC(k-ns zzuc{|dvoM_R7PE&>Xu7^7PsQJS(h#UvE#z2KqGm^LPeu8&9sc#%C(a;qXLcOSDhDm z%b_^ITA5LDcE<9_ni^i8-rG;goEMXL)oa5;G4=UtvmZ_>-owl)8gcN_ho8|~PQ~wF zp1HAE)rG(RKw|%sEq$H=q5TV`?)c2#RJnrvr;3AdV8yGg5<TjQhHs_(uRk?-z5K(B zZm09b`V+1e&CJ>L<m&5}ZI=p9{5gL2P=z&5#0s4mRv$VS77Bj(bneuP$xUp(<E$*c z{rIi4Yrft27xfz_XRpk?&$Y$xfARFi_w>I$k#>qulwS6w;)x0Kp-0ParOh~0ef{zE 
ze}7l&iCM4R<7zT_Doc9wn<)kdTytvL>KGF)BujKSy}a<Nw%j0Pz3F}HorizxeTsV> z&SQHo>BYzDn#wChcV{rDDgIhxIyb0(=~SnOq0-k<=UnFd=wmOxQS|Bj7j-*)zi&4y z)jW}FB62WyuZ`^Tsis@b{*Tq$J2i_>?aV2*+j8}Oe{?VVuQj~(e`#FRMfKdr$0wGZ zshjaF{LH_1oT8`i&xza9aBtQ12+@lD3$F?NHCStR=-<Cyzkdv%?3v**i>r=_fgzX^ zlyn)HL>Lf#2IRIgsLz1Zg+T2=1bCxrL~a;@n$`%7LX4OVZFJqptrk$ZfY6=CgrpnR zltkB#T+4%+HVEzfEchEb=q4f83#bl}V}+W8+(tmxj$HDADi4H1%-EsYQK}Mj{m7*X vs`ZmOp!$(ZOLXnXF$*es5Z0G-!nDH+q5yAJHjn}i1`dYbj0_A$JRlwbz+LW0 literal 0 HcmV?d00001 diff --git a/tests/test_corrupted_files.py b/tests/test_corrupted_files.py index 4ac2678..8d7c252 100644 --- a/tests/test_corrupted_files.py +++ b/tests/test_corrupted_files.py @@ -86,14 +86,26 @@ class TestExplicitelyUnsupportedFiles(unittest.TestCase): os.remove('./tests/data/clean.py') -class TestCorruptedContentTypesOffice(unittest.TestCase): - def test_office(self): +class TestWrongContentTypesFileOffice(unittest.TestCase): + def test_office_incomplete(self): shutil.copy('./tests/data/malformed_content_types.docx', './tests/data/clean.docx') p = office.MSOfficeParser('./tests/data/clean.docx') self.assertIsNotNone(p) self.assertFalse(p.remove_all()) os.remove('./tests/data/clean.docx') + def test_office_broken(self): + shutil.copy('./tests/data/broken_xml_content_types.docx', './tests/data/clean.docx') + with self.assertRaises(ValueError): + office.MSOfficeParser('./tests/data/clean.docx') + os.remove('./tests/data/clean.docx') + + def test_office_absent(self): + shutil.copy('./tests/data/no_content_types.docx', './tests/data/clean.docx') + with self.assertRaises(ValueError): + office.MSOfficeParser('./tests/data/clean.docx') + os.remove('./tests/data/clean.docx') + class TestCorruptedFiles(unittest.TestCase): def test_pdf(self): shutil.copy('./tests/data/dirty.png', './tests/data/clean.png') -- GitLab