From 12b3b39d4d5520af04233578ec93138eb192621e Mon Sep 17 00:00:00 2001 From: jvoisin <julien.voisin@dustri.org> Date: Sat, 31 Mar 2018 21:20:21 +0200 Subject: [PATCH] Add support for .odt --- src/libreoffice.py | 54 ++++++++++++++++++++++++++++++++++++++++++ tests/data/dirty.odt | Bin 0 -> 14114 bytes tests/test_libmat2.py | 26 +++++++++++++++++++- 3 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 src/libreoffice.py create mode 100644 tests/data/dirty.odt diff --git a/src/libreoffice.py b/src/libreoffice.py new file mode 100644 index 0000000..b7e0dfb --- /dev/null +++ b/src/libreoffice.py @@ -0,0 +1,54 @@ +import re +import subprocess +import json +import zipfile +import tempfile +import shutil +import os + +from . import abstract, parser_factory + +class LibreOfficeParser(abstract.AbstractParser): + mimetypes = { + 'application/vnd.oasis.opendocument.text', + } + + def get_meta(self): + """ + Yes, I know that parsing xml with regexp ain't pretty, + be my guest and fix it if you want. + """ + metadata = {} + zipin = zipfile.ZipFile(self.filename) + for item in zipin.namelist(): + if item == 'meta.xml': + content = zipin.read(item).decode('utf-8') + for (key, value) in re.findall(r"<((?:meta|dc).+?)>(.+)</\1>", content, re.I): + metadata[key] = value + if not metadata: # better safe than sorry + metadata[item] = 'harmful content' + zipin.close() + return metadata + + def remove_all(self): + zin = zipfile.ZipFile(self.filename, 'r') + zout = zipfile.ZipFile(self.output_filename, 'w') + temp_folder = tempfile.mkdtemp() + + for item in zin.infolist(): + if item.filename[-1] == '/': + continue # `is_dir` is added in Python3.6 + elif item.filename == 'meta.xml': + continue # don't keep metadata files + + zin.extract(member=item, path=temp_folder) + tmp_parser = parser_factory.get_parser(os.path.join(temp_folder, item.filename)) + if tmp_parser is None: + print("%s isn't supported" % item.filename) + continue + tmp_parser.remove_all() + zout.write(tmp_parser.output_filename, item.filename) + shutil.rmtree(temp_folder) + zout.close() + zin.close() + return True diff --git a/tests/data/dirty.odt b/tests/data/dirty.odt new file mode 100644 index 0000000000000000000000000000000000000000..926ebff39ef53a4d3cc5caeccb5118383482e8a4 GIT binary patch literal 14114 zcmWIWW@Zs#VBlb2;1K;~7kA8vN1cIz0fadi7#MOhb5lzy3sMsc3UV@&6H7Al^YqK| zQuOi@i!+P$@(WV)Qu32ab5rw5^h#1IN>B}Jb=aL(D#*Y9!nzC$3;~(RC8b5F#rlQ@ z$iUbL!ZS8Bbagg0HZ*b3E67Xl4DfU3<&xrJU|`_&^l%AcU|=v}U|=xhU}9ik5I>pg z!N9=4S>O>_%)lV@o`FGV24ggjs{{jsz+6uk$B>A_TO+drg2S)hv$SkfDH3FD)KKb3 zaM`IP-hDJ{sm9eT*38vBU4ErrE5+toof2`h*5djalp3IOPh$~}k=T0;Caolc<_<O% z&IOGQY(>oihb+~9|K9j(UeKHDx8|SU$<HjlU2R{zcI*7FYh?M3Nhr1m9Fi!ukm)fz zY!IU-zRpsH@0WkFy}|6WERF|Eoid)7FTQoZtnvN*k6emJ4oYlZY|L}`VF5?ei}&x> zuU`H7#R~_HizdF8w@jTX>U84Pty}l+*FQSad7=C8!#%xqDhiu#@@zgd<Lue9Y|RG~ z3=DXT)6UG`6jt+@Yc=)!^Vw(9S_B*g-UP^$F&RvXTfe{L<s`r5+?<>rKYg0ywUp;D zN9&>2*VkKHSO~Bzy#CtO)^_otMaTMNPaB-M&0Q-k<9k_DR8&`2*U!&S#MO76P34&x zhEZ$96tCR8nVFa<C@7ein##)XFJkS@oHWDG(9o$<rsUk&aq;BI%-h>?b#--ld3lpI zR@~W9czKy`_0LbKB_$^P$7h>lP6}FSBGt>*tf;BU=_bL--4hZYfB*93#X&27)YQet z-`|{mUP@Ye?wmOxT2l`ubaZ!r{`9G+v~+L5L#L9GEfG5Zge9eYjvhT4w6f%QpKNPu zYh`8S+O=yB9y~bPEVrnjAR{kNjH}gZ?zwyS=AD0j`{qqUp5mXMp8DJUOgY%ZdZBj8 z+y@U2xASvzb8pYTpJyUvD%Bge-duvmYHr`@r>CBNs{Q?qlch;m-S5Zm-_M^tJ9g~Y zuiw89S9!dM4-O8Fjh#Dn>eOk|rp=q@Hz~!}xA$hwzS7s%uB;5Mt*JTIEA4){q`G>y z+hRir9u-eDH8sC^Hj>Fc0!K9Roz^7u*#CII%+B6^>sC~g!j71A4hkPWe0cD&A~!de zt=Um+^3!L}jvYCo6TR)tqodvS|9&i9ym+yPiqY(|9LY8|Hgo38@d*rZb7OPTP*6xn zNO*8#WAbwU`O7k;%(&$)KDd@+Ce73K?_XWlq6cDj%QB_x3}*T`J3GHQzIYcuFYnW5 z&+hH5Hdj|?SKLzfw`yn1yjioj?kNj8ndjf*;p0>D+_Y(vkfMmB<j)Teo!#A!zj%>x zqm9L}Av<H%vyT;0y&&CIbNkLepRAH;B2{S<S5fifNT;x7*Q1XWMl;{MyuAF{wQDSn zJiNS?mX>E{8n>%>Uc1Y4_(R1WUTL!@PoMhEG;&oCh}mEF_sYs(Jw3gCd3!!SJ~xTS z&FuU}Ji$RhO#-uQtIMLJZy#)CH#{cd+8d)S;jA?^EiG+d-QQa|X8WtYdMQmbF)^{J z|7WAF{`}e5*%FVt#r5Y|7P~FKytm@xqJ8_~mS5hwWy_PNPrrWo(x7@Et>EIti-7?F z1z%nSwp{r5_;_`7wf{VuPD3>-YwP2EvfctMTeq67d3<kg_2*}2nUxaFEv1`8MMa~x z=S}rm+Su4QZ{9pdM@Kt5yN@3~{`>b&#C2`NMkPhXhex}`rOorceE$6T>sMYLo+(~S zH*Vahs;a6rwM&6x+ER(OnKqT1_U^S6P;nG^e7wK?>*8Izc4>#NOXyKnQW9`t=aaFh z{<g+#F{5(tmf+afSU0z02M#!dgot>*)6wT(X<D>MsnezC<D;WTjvRUNB*ov~e_;Se ze)zhWl>r)k^7eKX7AN|64%=0IdGYG%YNv$-$9g0s9)Es*{`;3N0&a^Zo`0^~^5@SV z0jI33vra$Vba-Rgq|;ATRaH4TIX^!?zrXCQRR8hEMLT2k#3k-rx^yYSWL^CJdq<C^ zUR@QMw9&%U^y};O`^$=onzkQ3dUUpVJ|927zK#w{nDowh=gyrwc<|uM%gdXK+1S`P zzHN0{=)m#y`*-IV%xar&o_ShSU0q#VRJ8B!x7$B{{80Cw$HBtXwJ2j&nnYWo#O9kh zMl%JRrcImX?Ci|3&~vgHFE8)UU%#Yy*ebjyr4$x^oT?pu=kDG7ySt7)ElT8Snc%jV zaW@Yi-yz}2CtW`L{L|RT*m7a{{JJgyDJiL-l_B}}_Be6`int22989>ezyANTv$HQ> zy0k8S|GB3{y>7~m0+*Nhc6W6}?I>6{eR_CFNl9*QZeiiaW_Er7r_<APC!c=G)tYpE zo-MbS&V!E?28CW;U2*HprFwmReN{Z2^uktOHJH18y}r|l43kMJp0{q@`u6?1a?6q+ zO{W!m_Sk4@u3WGn;rY3_)22<+n)>R)!^5-9a;=J<c$AfuB_urP64jnFcP=wCb8StH zg^XWz_Uhi=-sWcJA5qIM&$TM`(h)nIvZ<woMQ^&cQ0KLEvC_iA#Sag)#_ldVdgRER zeYLx{Zk@VlMdaqRsZ*y$Mn*<PL^S*sa5{YWu$h_JlFgfo<KyL>H1zcJtgUzN-o1O( zDlH?UPahs01|>HJe_vl+9UYtEXFf48GSV;i|NmE=nfY>U^!8V;UOn)Pikh{1cXdwA zo4MBIDnchGsd`_@Dy^=zo_^X?g6H+MwcR~EJlx#fU0tV6o-~~4^Y!ajIXSu9+}z#e z@7*|BTU!-dc-Wd3FIGMzA#YjaqBePDh?cy({EP)#wrp9t)OBS@T577LwY9aS<;kZ- zHzv6)7EGQqdGhAu<9zn^`)ACM=-KGCG-&nJwQJYToH<i%vZvm3d7FxWwPCw=?W+6# z_q(jDZ2I|mth-jNT)A<}78edz7ndv7ughDNoVb5qUNI#-eR+V!s#UAb&9!bfbldRR z*x1<cSbW`2RVNOgSFc`4$;$F-tX{QBVdjBpbw58neRXwp{QkPyA0HBz2L1f<^75-! zum1e`^XysL+OX=cuR<LH!o$<^^Y34|a^?2z>W7C|JuD<1fB909pqZVWJ)xXq;g25` z%?Bq;o3<=yrNJ?dv{NTeOz=?QYE?2b6H{cdmKQqu>+5U9e5Q6op3fg1I=}F_Y~mW| zwRF?=?c3L^$+^F;R^svB-`};Tdi4~hrltl31-ZMk&t_<AZ#O)4_Uzed)27Y&Z_?<n zq3mr`PobmZ!KL2Q|1Aif;j{d5?(J>9OM~`SebrJ`O?`Bv(`n%Z1rE9X>lZFCWVi*b zc3Sx2{d;x+M@L67F|lpiwz0CZN<7}SZQHkRWj%%d{^u9DcF&nRSJ3I@%a<3#i}(Vz z&z^mIPvvJm8H)t7+48nkTgu<Zl_?!r1}e05qqmimlzjRA{r>)Xds|yu8yg!dt68F~ z7aly`w8>~~821J)Zf<8Mr#*Z3mRjBY@WJ4)fr#tDbXQl`8asO{t6j^Mv9UFV+SdJv zP!O17Umv&nsu54|?{9BytG`L`u$eIyl)t^j`ZDv+kB><kf4sQ3IBTnwiAl-HNvcx4 zX~%jb85tQB&NjSu5D5tlJv-N0Tv0?wXwk)tKY#w%{rmB_v5|4|$tMpUFbH-Ue*5;# zOsaS9-n~i_PaHqqzDQ$|%GtEd-QC=d0!2ITT)MPr$r2X@ft(u~7`GkD(B<cBT=sId zdA?ocr<UW7Jyj+tyZ7zbxpU(ti4!aL?AfzvlaZ~htO@^Y^<BGnXU-}sEKJ-g;3%+m z?b`BpcO+$GcsMyZd3jTxGWQ<`h1{#FtN;G_qobqa#_5?C6(x1Y;i03zkuNVVKR-WT zUPMG?@<}CS<(sY>j$SUmUu&LuX~~BV1v^q5qa!14-nenV)Fxfx*u8uIzP`Th>+2gE z8{6C4d+yx1ZQHgrL@dAj@@v(F)r-3{Ci$M!F`Bi^@lNK}D6@2Dl}{CW?(QzvkJ@tL z?p;|C`C}4YOcI3!1snG5i(7orXP(W?+qb22!~_Kk-`v>P)59Y!K0W*TI$j>0it1|r znMOyCA6K`uym|I)?zcBLC)7k9F1~!}lABVp?bh=5_jVRPKR3tna*7dS@C2307Z()w z#_cbD?&opU%*-su_u#{VFIBZ}Zf!v;OG-*|GBPyu^}m1k&~S(8SFnoEL9;h+-h^m~ zXia^!di}mn-@m7OpW8pBNIBGOLqpv9`&%-F1*~%B-@0|Hqod=<5to3qBCZ?r&!!ol z_<8A4(8h?EhzO257jN9yQUCwnr%y!^e=aU|pRN~s>iqfhXV0c~w#_XJ+3C=ywZuZ^ z+<^lQUS6k8P1XKc^RHLh{M6~wQ@x&U$-Lau)Z}n5smF5?i!hVdj%{1FigvmvDk=&J z3d+gJ{r>j$_P*NRz0&5ZR;@a8=ulI0v$K;^#;kLvPBG;=T{dYy+$bTz+AI@#XQIl_ z*X#F3MMVXPOr0}_XKHX@;Ka3S*SaX3JbO0vsELbnWTd3$ByMp%li6pBpPkXv(t7p& z{q`+eVzy*l>=afH3JMa_kK;MUJlTMUx%5%I*YE9GMMYCq3OFfET(Nui?yXx(pPiZ6 z+1dH><;xA_EKH1^|9;vwaP#tR-L`Gr%9VnhE=~&rG(>LQym@zT^>hV}PM1sf?!9~Z z)HLmk#0KsD<JPsmOs4Re`0kJW@~<|hx5r??%jf6kuaDa+HE;FRS4+L8+f{$dNzrS% z|L@<w_`090MplOnCZzPZJw9-NVYaJA*Qs;o?(HhozSGCGEU;ICRj@8G!z{hoYV+pJ z(Oa`ZOG{06R4;kGcFh_WrHMXj&Cvos#Kgo7=$rU@FTIq%@8_~jn}n8}s<-y~d08~y zTJVs-pVZV;QyUwe9|BHta&k+TE<Jnp?16|i=7z^kPuHJ*`sv)cb8Y1KS21p|a_Tt# z`0Omx*H2DPK0RGuUsE%&@?CA>#s?8%GYt}%Tmz%liY1(#X{@fNmv^|0H#Ro5q0Q9T zQDEc7jqH3f5>ir0uazcT^!4?9`}S?sUb*C$=xFZSta~?4d|z?gpr^I9b?@HV<mBXx z3=Nf;H*Q2sQkkw7J8S-YeX;I`iYmVw8W`sKZBIQdW^QgC9UVRS<dTaSSF%dW%ggig z--C*S-)}a@?k;P5d;hJ=m-|h9swaz|pG!?iaX90&#mv<7=+UE&9DP0UJFdO{`u+X= z{WU*}Dl03~)74uvk6CDIuU@_SwS4nm{{Q<P%YM0@Ve<OgT4ulY&QPsT7ndXX`~R|O zO_^Kqq>y*@d4prk&CNWA|9rXZU%+$FuHOCe9_23y9D38MKRxlBq#`UN)R1E#)91F> z)6dVY_?eHZt7}Dth0HD|X(_3y4-XDrUhY48)+`aOR=33;zkDevD*E*GYwYH<v-jm> z|8cALSvB9-Rr-2$`1*)rOq)~F)0tOKoi=TgPnv;5QE_ql*;!L3P85`v-=B6?>f49s z`IqnSFH@SWXK?K8+qWxMuGFz@x^DO9!{J4X7M-{MZ)0E(urg%Rs_AEsJbn7q{{J6i z>FMwJyB+>tak#K^*RHycN5y~s{CW8B;WbL494rBWUP~|OMsMS2b?REwP@OsJ-K$r> zUM%ka_3PJ`C|%D<Yu^96*<61*da*^1t*mJG(TNi$ii(P^QQKGal*>^-S6A20&Q4ZV zc7bqX*p{f=($dn@)TMsQrLD_!er@O7?=HRb|JEp7k<PTulCSJKHtZ~Z&c?!WBE@J$ z{^CW8($37-Soqkj)#>23TKVsL!*llb=7gkPyLN3`#m7fy&iGigdwF?j>FKqxa<@8t z>v-^&@vn7+GXIQ+Oef6rL4f~;kfMmFD62aQQzJvl`DS*0D{JdgtGoY?MAdimzhJoY z<uJefk)uZs&Xit#Okvmcwb9qFUpN0`^Sz!aTf}09kJ{v|Teq^ZvpcuVvnpM+d$+aI ziMw}Wl_vH~$#{R+;=P!}`+M5T|93Z6d9Dq6{`4uRfaYO~kB@J4O3cod786U$&HdZ| z|I^=A#((!W-22y`x8zUN@%w4-FaP6rxaw{ieQ^CJ1_p+@s*s41pu}>8f};Gi%$!t( zlFEWqh0KDIWCn(cIkr6qxf%?3oZtWLs&$Iqonu}7>DfX10zMYUkN^40xYPoiDtLWA z8@0W*yIq$f66d;9ady<kQ_ub_^$ZKslslVr#&3c7jTuKPQ#n@M=ke^~3h0WMI(gy3 zmH*5B+<v8e<_kkGql`vIZgw350|SGntDnm{r-UZR_zedK2Y4_DBm=|T3=9m(`FSOv zk)Dd&oVDT6`M1qP>h7->Fv+>maB@@9^4u*qY9p?vIxyvUx99cqczPbGxae_e`oCYY zlP3A7m#tDd-7F(l_x!^LYw=0f^UIsmrC0F=7flu0TsbSCQ#5f?+|v5Giurpz@2}gx zD=hS=fo|uaxeG3(MR;=W=gja9Zo3(JD>G6nWrn7z(3L4-(^oAmx}cR@^71;DQ1bVh zO}p5S-<tOBO-O3H?Il<K%i-T1M9j|EGI!OR4Gvjr;#RIaWqo+F7vt+iuG4HkB^{sn zCw;2fwQZ^M^0(i<S!Ni#>s<Hh)vNEE@8f$cc)e^^wa7usr&n~-J|%@cVGaxEUiRzx zhK=5vg!x1Km$5~DQCXE1?0>Gqx3*2{j)JYFxy=uYJ={jNd$@&ObEV8SHmsa{|KO7U zckap+Id-qiwtOQP8nL<TlNqzswQ#TAbylqb3+~<R{#i4#y|Cq;9ryjLw3U4ieneWh z$@KSV7j~H(5Vce=+EtiorM6G`(44>>Ck}t%FnhA*<^Feh?+cw-tr~7VKPWl*m{Mhe z+irz9cVDKi%Ur!hwsYk&bDxfDM_)uIv)aeM%PVWRHfzG<fP(ai&M%!Grb@eQkC*>? z`1$9DTHhBK7D_&Kp4xMx&bs7!(AhhF0_B|(IGL9?7OmLUv!%a!k=X9cXE9D1LAKj| ztgzT%$X(sl%wPW`Pp{aW$z*Tc(ia*m$7~yyUiFYl7nkap{VLaS%9%6n`o-(279ZsL z66j%nIG68VY(~)8gbC}~gH1vfoOE|&U7i`H=ws|;X#V>31zz_;p<{imp;fD;(k4r* zacW;;sk+}F-19#C<@Ob!&yI75uw?#|T+Ji3?S6a_ze@OXU3;c)&n7EB_ETKFl7&HJ ze;CtAWtY=BZQph{&iWbii|M(7$mAO4J1p1FOq2Yqbe+$o-qB!QX=c8TsQ$;7%2W53 zd;B{)$+~T;SKf=t`+f=k&E-EAzr1@oFn#K3%{i6U$@hN0zu2c0+tdB^-F2z0yoZ~2 zxZb+!{a1n8Yd=eiC;RTbUe}zqFSuotRrR!2`k>{MgU9YBd8LGHefc=jcj3%!E0!Jh zS#0uqNsM)uiRIcioDS>PO#Gh2z_Dy=Q}?70p*3Nvj?CE_U?_6?{WG6`6J%=l`3sl# z>Qt)FC{aJJHMjoq={vnKhfTGFGQAcBuGO4+@ke9%#=72?hTn09{W?WU7w+!U-RN}Z zLfs_J2aLbu4xM@Dzk<_s1-H;$=>>XAoJH<B7uonW)je{Y_}8Mqx2pDS{`<Cnw+{-Z z`Yc<1=twt*v99}XpHi`P2SgriDpYJ}KIq>r^K<Iwb2ly+&*|4U`j+VNXL@8~eSFT7 z*oxeWTmGA#IfrI4w68eY`grwO@54LRO9y}Pc6jEPlwNlK(n7hUos&}zb1$%8^uQzj zOq~9-^)q;y_Sz?kPe1LrA@#`J8P7LA{c|-*z--FZUfxSjv(>-m)i$kBDS2*o_x+LO z$&c<^r7qTtSzDKUqioip*=chBb@hJTU-aYW^3$#xYs|A7m&cjroHV%ji@R_7%wu=U zHFMf|lvtKcWL+|8JD1@3PmLGsN^%uc)xOEC>t@NEv@T!cxz^7WChD~fx12p&QqK3k zQn3Hb6#t(+z?+>z{qVM?#ViaAhxi#7AY~f^v~0U?Yh(DCfq_AUiGhI!G|-)xmzkDY zT%uQ$lD2f>#=K?&5!dp4hl<kvG(~o?resXnnYI4_t8>5s{+}}^{i%)Zy0nNd^y<ZL zS9g9t@$;e8mp5lEUR@Bg5v)Gfvv$L~oahfH%|9~RPYHbY%FR~ZUuTK#$^xw)H-85* zp82fvEM!5p;>k@s--MR*I4doi{q>hr^sSFZNjlr7tWXZSoUkU|@!>1JRU4HWXIXU> znv^L<G3Cr!XEcF>v-zXcG4I`v-z4|f9X%;t)42Y}=4I+{|9pDG^YDpQ@CUB7{BxB` zihh{-{<|jqi|hG^+8<d;S0Af?x$^ayp7MXK=F-hme;S&WEPa(abIVGv|FK)Yo=N=s z$f$}P5;)ije$W7XaY<!PYO!8LZqD7P==HaE2-dw{Z?Glfen#ZCoj3C4)J7~g%XCsK zcaz4iEelj{B`J3@dH$Rfzu$O?#>H(<7C%v`E0AkSK38Dz`E09mP=1}=lbadGmpob& z!FsVms3$_OyG8tV=!f6S>nBux_<Vc+9>LkX{WpyS3#V>&5?<Ob8LT`bX2nv~bxmGk z-j1T-r<<Pn=$7sgQPsY)Wa;_6Cf+v%m2+N{9DZ~D+M5OKes1?B?<}#cNj?1Lyr0Tl z{<%GS8dloZD!2LBt?0>N>2!??I(vSLMarHZd$@wuo=%l`_x<<BIy12%X}9j9N+ws_ z9`1Oktvyq6oeE#%u>wO;x5pYKdn_-Ptf)KjdUZ(1&sirAUI_5j-?OUh_Cdjox4$_E zFAw4C<2_vS>RzPLoxPin%%2@R_uG#YxjKeLKi?nzZ6=mtSEBpGwoz-^9+{kFPrp6q zR+I`YZ1N3Q{I0~PlH+ZDe|7!TTK%1Amy-L}Z#r($Eq%oG9;?&`SH%K>ZHEm??+Eq2 zoBBt>D~xNUw5!=+ZjlAXxtFxBR$iMQ=TW*QdZx<UH4Dz^w1hM}s9T@=W<U4J{MSG9 zoep1V*%D-Yck8j`|64qiBOWPqzSepjTDIYY#*QnK#JHL{gxEZ{>Nq&|@>;QMHE>!h zvV!}vK-L64*G)4|r5Lx#AB^6vnZuI9AuVM*X@f;@{{0&%d2R0B%$@de-Rqw6`l5B5 ziVc_Bgk635&u<or{^_0O^!IM^CT7PUO(A?=kIgXtbwcI-7jL`dHAnmXS{}1(N@X>0 zc;hwk@}ri;PpsPB+zUL}o4wD;(xGlb-T`w5)2+|F_Ihm<d$GRYSm@L~ug!l<t^Q6I z{l6~bqrKg)1Ie0h?<4l-$?m!Ddwhfbp4Wx9nJlA)wk-?|Gk5bVTKhWelF`$`2NkdH zFPSnSBd9oI$p+hqRa4ZG{2lfs&5{W_m?Go)xx@KO>4pC3CA`)WjlA;(LoZ3(;+@2G za(@G($>uv351YP|%@QvE;M6Ol^Sx=icNT8|+p{-2{W^0WnVAH~)l3NqYM$CZMOD#$ z;Z(6594x&4Gd8R-(GGqkW>B?8b@2rSP2O|Y`<ix(Xie{CS(ZNO#aB-UhaJXuq&D3A zy0*~EN_L?|^Lf{m21&oJ=G_Q<&z-Gxl%b7jZnjvztJ;FvOPiW@?H3GFKDxhQ*~Kka z(sULd-L@!NBB@of!!<O6Gw|c@<Hu^2C2Fh9pQ+RHDr`Zz>M50gPnz!pCVx!6`QY&p z>sQmS1imVjc{u&Po@$c$w2jS|zxgCN`~KW{rtQn?d`IpVo_V~UJB~3Qyc)oC;ZpL< zO@~BcCtXxw+|r}BdV#=$?{myw@%QZ5q?HsK*;uX6JT<|5!T%zUT>(m*?RRtvCZ7C} zXjq#g{LCcF+hOzBY55$w=ksfSzv%C=i@DqySbp{Y#EDs9`AqLh)c**6eA6ynochFu zP4%><KF2)Q7zIE58K=CKrt7Tt5W5{$^;IXiPU2?p`zVhyygk+{9t*fg>WaF3%@$V+ zdUIZ^ed1=Fzh@#mx7;wA`)H%k<_}9-|FY)h)VIv>TH(`lFX`c&>48ffY)>pV+pYI> zZ?eh9PrJfpcudUImPAiw3cjBHgN3=?N4<WHYtfsa*Go4|3+6OsSI|DY?Dv<HED!Gp z?zJD+m`eITGj=!Dn0<GL!EVE|-@dGf(=xEVde~%F`q9S=edk?&TEBe3df!x$OKcap z4_<yMpm90Xt$*HWH@lyvj33YLt2=nM`E4zK^o8vbMXCkHB9;o%{A_GxU%kxOk-Ptb zQ1Tb+Uz|M?zi3as<h|$K5|iswzZso1S-9$^8PCxzLhsh)UYftcwt9!`VaW?;9n|^N zryQ)?u;9v-zLVj9OmfP<hGo9^YSk?JS9{C*_V+K+@}9(&elGuXL-+E-`QJ=dookdT z$TBr^T+X%JaZ$gjwX1mFoAzBQ5AVBwFTZ;-DSwWPW4l>N)`JOQn_5f`T}id>vq`92 z=&CB+qkD3$xu@oawysZ23L86ja<qK&$xc5eAXYK4($RKyUE`KU*N|KDr1cs8PhZNC z-XnQ?<C?VX2b-85l<><Yt$!Ql_*it!s;^gykN;3vxFb_maot(ZX*-+sW0@r{Wko;W z&6s|}cv{pp$vY168s8qQ{Ce?PdBy{#{EyFE9$!2D$ZYlMd7YJcem7P(Y<XO2dgg|k z^*6DHe#MQq+MhV|EH7{S=*)kN|E`4n9j3=O=O+HS(fl#l{8<0?13w;~GpKuY@W)E& z$IGJ={#2w(*ndy>bEo&?XI+E3UB*4nZ6(g{Hk@~#>G5;x!xa~sKQ5Afd{oz9-hI(y zf$E0>#rIv*y%2jxeAi>Sn&Wpl`tzF#O?DbjSZ=G}zuR%%KA|VUllp5EO@b?0?taJ( zjh}7C5)^Nl({ka*?$exsxvOhW+`YKwkoe-N3%()qK0e!fVt(@Z+dmexe7civknz5N zH*(?^%@eytOCAYM3~OGOTJ=8gjqr}0x8rOo-zUd@xDpg4abc@zulPoj`}-23pYQ!C zo%uG{-p=B|btA(<mU7lDtmT)LcYJ@+@_*r(#S5BWz6m-P@Xh9+|Iue>cOL9n){`G> zqV&pi-88-jnMx|`%CFv6eps#Uxsy*%Uv~cU^XcvfzFqn_>DrbWagofe*`Du$o}F5} ztJi(fW#?qsd+b}~o3HS6P4%18nr1Z5XP&LhozEJP#tnyk416{p-P7Ee_d#&e<aV|H z>!)mcx7{)ExayMpz2ClkT>p0Vyh_WznMwax|5evdZ=M^gx%fv>T3yS&jrSfbU0^1n zpS)h}Nr<V@tgEjar<8pCc>4Oy?nr&sb4S|m#xv~yD-3FMbvg1pSKwe^aMVFGx*)3y z?tXIS(_vv?0AcXzg4B{kP^-emz9KhAp)9qiI5R)bR>@G$KuIArFF8LYGcVm%DKx}Q z*FwqOj>{%LEiE%S)e5%cKsPtFBvGLvHz%*y3L<5zR9cj01zwzBm6w>CT5MI4Y=yi^ z!3tTm71$z(NJUO&Ubd}LMoCG5mA-y?dAVM>v0i>ry1t>MrKLVtRtaiVN-|tsL1|GA zSXD~0erirCXdOhczM-C>K1?mhg~U1vq!!sp`T6;9i$H#WcoA%!k%57UK7<dmDZMBq zB?qn^$pc0P2FCj7`YDMeiMnN(spZN_3J{%ENC9r7qyRR+s-!3}uQ)BgC>Ole18!bQ zesVEn(T6U`DPTuHOwl*gGt#dp&MDE)Pf0T>N=ehJD9%x`gT)odQ+75W(ke4Avm`Sy zM>n}BHL)bW$S$iaKeIS9Pr*hXt29ItSaD{4o^DEFNvfTZfuW(Ufw``cS%`tLm65rX zi3L<QsydsLWGf5}CRQc}R)%Ia`Y=h5DzH8j$D|}%!2}^rNKMHs$;?aFO(`t`2Z~)l zh^b*P)LslpR6WU+$vLUTc1CEbp~4Wg>8W|CMc@$e$xJFr^#><%ePcaimC(G*3I!y= z_=2MRtkmQZePd&TTtiD0r_#)v6kQ_&nDGdcASS~zN^wbINoH|LX0ie}s7n%)a#D4Z z^GoweY?Ta@AcC2>iRo}bL#SYW639Mur3F}279<uWrWYj^WWW@|j4019N`Z+PL5)q$ zNGwWBE=etdNf;U`>D$>rle87ckq92#7hqvXPX)eSNndf7APWNn2um<9Foa~3<|gGO zX66*@m%x}fRudg#U|?hbuO>>jpC``1z<dL=aHwL=+iM%M=Z4BOJe++ygL$Qt$Luam zJ}Z+)57HLM>aa#@^EtKa{bTfUymRH`#aQvVT>lu)&M3R6DbhW;sbteqS(RBCJ`>tJ z-haJz@#Ckatv2UB%zZfLp-jckk7xUiKV~dR`~RBz!;Xm}hMt0xi6J&-PL}%r@9gsM zXL@sE_W%2H{Jm}Y<IP{setdNEyS|J(9~*z&zF#MQoh+ZX*F0+9|H~7(^Y>bO|M2JA z_4)T}etvqjR)2qV<@Y;>lWmqQ^UZqp!T2-#_V+nwWi0=_ox1*-SzG$KGxuV4mt}0( zQ1$WZ<>hAU^WNOMd$02ErJL`sugjHv^Z(Q1xoZ#pxvO6%`&NJL68;B)#$Tsc-Pmnk z{-`zC{?4s?cd9p3<ZRgd`S|5-ZhzTXZ+QFneY*56UOWF?o=j|2Qn`u5IUct5>3%%* zx9f}Q(%zqMzWBuBC!3sRu>Z=|>vq2qHm!@BS99)vb@|V?{L|G>+kbxXNc!^qzW<8* zpC3_i^$8j!M#F%TbnuTY<3W<Nag@nM&{#I=NH=UW2qXhqR|ulj85kIx^YhX&)1l>k zv5|gaa&l@8q>$E6E-fm8j)F5Ypq54;?Jz8Z#h8Mk{Pd#K;^L&lB7NLCLFE{_1JZKx z6H8L@>5{@`Uw%PpL2hbZDIVLf=_|?4&nZaEOU=Qf51T7NR*~*EkZt&N$z$;wxVkFV zcgifuO)MzJ<4|lK1l5P7NYQ|8;ERWWfuT6H1XT5dhraek-7dQ2Ay9jNeZfoFX8n`5 zruce2m>`hs%-E-0DZPX(P11LDnj71{x@wgVRt+*rCcmF-(z|P0Yq@;)(X-1_Rs7-{ zGW$MVv$kGasjk4xvvbb9D*wf0Pfz{6^5Zp|y0Dys?Gc4ehgn^0$u9(Joz{EDp3l2q z)j6p&W!CFmDS|I{alVphzOyH+{&$IO=(N~KF1_d5@12>kBW|spSoh_lM>SVvetq-n z*REOj?%GH#nDgP~!d07=-(Tf)B;55|>zNrLH!IhcMV|fl-S3*)zQVIRuiyBQSp4VM z*(k?v%MbCoTr*dHefrv#=>Cvv@fJF9d!N0net*4u-{sHyn9L5{xP8ZP^^A8MQ-aqA z?q2z>%OWO6T<uuvopiZbH{%XI7k|FB|2XG=t{SV``QL6{Z9n}w=~BCK43EMRndWOf zCpBN_@+P}5`c&~Q7SdR@Sa~*=xRjhhi_J@6wJ!%BX$y91$Fyx|DYzc_TUe*I-OG6` zUvZ`S9ivd`qiZ+k>dhDDPtAK*yCPP!XQAlqa82>8QHjZ>^4uEx!-6(04KLFwoe-_G zjX}SE(p{HigHvnw@9*aP-zm}Saiwh2*?^ggKdd+%He-fZm&&Burm@q$s6E=VWzoH_ z-m_WRuGZZ?pT)ZN{R>474xW}=S8=Wx`<_3FE;zY+@3GZ?`3217&Mmvy>SE}8=a2f9 zSKN)=b_ZwtY<ny=nX%*3MZ2l@(+yW_JCwnpc5~eqqfdwVH~;vmCS2s8mnpkY(YD=n zzg_X+u3vIfzL=(Ont$SeN~T4uYzo7+>CL%8+P?of1YN8?1jR30n3cQuR@*%zpZ7u8 ze?{8Qym{?w=26@d9_2Bk;ZSG1hs}z;Pmh|^?8zxQdF9k<IgMAvBD1G1_VE|x;atoj z@NUP7t6_^o4!)N)J~MA$Yv7R-Kj!iVca5A`?F-yCn4IrAYI{@q|7CaaX}=3~!u+MC zY~5RPUD)T_%%sdxzGSfo_9^RH*w|D597zcC;#*Lp73eryJ9kCWLT4W>Uw1pTixO9? z-L>==zqzldrgej{vw8BWZXFp0?!d08(oT2!w&b_9DFxTr+>&v5Vf3Jc%a*~h=ZDi| zC$aqS%+%7KMmbz+&u3K}ON;34R1oyzyg6OSca`G&Dz0riGWmE43i%%_`s#HyfAiDF zdB1BaOjCTBA7;mIU`W&8$dkQM)!v`6PxaP^lc(6i?2C=76O0lUt!y|eyF$fa``uIZ zZv=NcuIPKW$*^Ph*Hh=udR8gLluPkDY~=lC|LLBw$Yj~p?4;VR#YNkcSGF%(7#_Ip zUz<<=$6YZ&|5!c<i`2zkVmahw-nGfmcVXYkgJovrH+p*(@4NI(wnwefRa02|LRo0+ zFFjS;qvBeRZ(851cWGAp5h9ep&wK1j;-~K#&rk6E(dO@T*VWp|bc^wVPTp>T`x2=; zr&w(<+jVFePpa;!-P*^#8YOf2ik(@`HF@sJMRU~Va>-6fd1QVyF2-Hou<ZE#R&fFQ zoZri0%g(RAxNv9Zfz-)2@}}_K<gO94NLwG1al9&Bdm(QUpIfYWiblno*WoFjQypHg zKHR*>Jv4gFrKQt_#oQDl)+olZY;oX-K7Q_-*tVtH?`!97xD;A+?)Wsxi|(xpWL=Yl zSIju~_50edTAfGk96rbLt=)R^8q=akr<IE{G&rA$InG@px2dy`cb>A(aW?77aP0-w zN5x()+g0*W^Y@mB93j>)?t^{rJiBl6{c)@Nbopq3*>s@`k>-B6-3zQYo!-j5Z?nR8 z+uu_6C+jxeI(l+b;k|Wl-35*=x{xvdcj@9?pI<j--FvSk^`8keC?KEu$H|J7fnla1 zY)}AOnJdb`z~Ji|;;8HC=cb<vo06%>&DlEbe%>Jkkz?<_i|}smKKg2!PVd!}yxsR& zs~MIopW4FI*nDxjyy?G}X6E6_do?E<e*c^6xro|kU**ZB5$apCJf`t^GIhNUTCqz( zzI*#Y;hDbg)p&y!F}43(II*Sjl<aQSk6}+|Sw1W}Dpg?lDMmVA>9c)|tv|2-*b_SI zo~n9*stS+Og1c(!h8nEW53V^l&M*urXgjDLyV~e(=F-?sk;LXFCc8g}E<RWAbg7I) z+v=tOr|L=p-qnp#q4SUL<?g%JwD{(R-#zZX%ct!<Cx86rVeeZ?zdYju-gst*ceTk{ z`ag|I+&wSX!9K5ia!%yk|5}9--p*Hi_gdfaDhQAGFMXR~llB5}p|p*k&ITDO${uz8 zoVN2~<NX%%>#KyeZr^>kRQZ$zJLkgHx=-)jdTq39f0zhka;faA6S|Qqjb2%80;l(y z-D`<o(BJI7c4;D4icC$m&8=PA?=3!9{`BpO%?H|MdZcc<-f^hx#D)0DpYQW0KM#%m zy;5BBeb%4z$8z}@9C)Tre=KOQ(Egp?Wq(j?{y)Rq-Ob3r@Qni$n~Y2%47j(Dz#2pl z8n&S%z#CN;`d$%;Y7ps5>dq0E(Xa%DJP8O|#sD%Df_a5d7B#>$!divsbA%90Ao8Uc z0|PvH7Nw-Ybi%Ae9s&bRpn>#2Fq;Ge17bQ2rjG%-{RVx2459}_#wjo`<U)tb&~?S3 zb%H_aA$Wrd?k+LH6vS!|<TfzG1Q7XMgXk79!cfSFA4oe4gIdHeHUop5CZ?gN?PJsg zg3!sp0BU<8_!U@8LbR80n*wSWB1~C_#gqc1_A{~OJi}rRsD)bqZBOGi2h~?{TA1O3 zXkQa+N(dHHK(2wbw{e>SYH=gNrVEQH80~M|hM@-ZaV&-*x5pvoz;Y6Dy#Z>GBOLW# s3$=X?OL{P!$Ym6$UP0(AHo#HK1bDNufs_d`2r-;yVPJ3s?Kooq07(h5Q~&?~ literal 0 HcmV?d00001 diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 717de3f..743a845 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -4,7 +4,7 @@ import unittest import shutil import os -from src import pdf, png, jpg, audio, office +from src import pdf, png, jpg, audio, office, libreoffice class TestGetMeta(unittest.TestCase): def test_pdf(self): @@ -46,6 +46,14 @@ class TestGetMeta(unittest.TestCase): self.assertEqual(meta['dc:creator'], 'julien voisin') self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1') + def test_libreoffice(self): + p = libreoffice.LibreOfficeParser('./tests/data/dirty.odt') + meta = p.get_meta() + self.assertEqual(meta['meta:initial-creator'], 'jvoisin ') + self.assertEqual(meta['meta:creation-date'], '2011-07-26T03:27:48') + self.assertEqual(meta['meta:generator'], 'LibreOffice/3.3$Unix LibreOffice_project/330m19$Build-202') + + class TestCleaning(unittest.TestCase): def test_pdf(self): @@ -153,3 +161,19 @@ class TestCleaning(unittest.TestCase): self.assertEqual(p.get_meta(), {}) os.remove('./tests/data/clean.docx') + + + def test_libreoffice(self): + shutil.copy('./tests/data/dirty.odt', './tests/data/clean.odt') + p = libreoffice.LibreOfficeParser('./tests/data/clean.odt') + + meta = p.get_meta() + self.assertIsNotNone(meta) + + ret = p.remove_all() + self.assertTrue(ret) + + p = libreoffice.LibreOfficeParser('./tests/data/clean.odt.cleaned') + self.assertEqual(p.get_meta(), {}) + + os.remove('./tests/data/clean.odt') -- GitLab