From 12b3b39d4d5520af04233578ec93138eb192621e Mon Sep 17 00:00:00 2001 From: jvoisin <julien.voisin@dustri.org> Date: Sat, 31 Mar 2018 21:20:21 +0200 Subject: [PATCH] Add support for .odt --- src/libreoffice.py | 54 ++++++++++++++++++++++++++++++++++++++++++ tests/data/dirty.odt | Bin 0 -> 14114 bytes tests/test_libmat2.py | 26 +++++++++++++++++++- 3 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 src/libreoffice.py create mode 100644 tests/data/dirty.odt diff --git a/src/libreoffice.py b/src/libreoffice.py new file mode 100644 index 0000000..b7e0dfb --- /dev/null +++ b/src/libreoffice.py @@ -0,0 +1,54 @@ +import re +import subprocess +import json +import zipfile +import tempfile +import shutil +import os + +from . import abstract, parser_factory + +class LibreOfficeParser(abstract.AbstractParser): + mimetypes = { + 'application/vnd.oasis.opendocument.text', + } + + def get_meta(self): + """ + Yes, I know that parsing xml with regexp ain't pretty, + be my guest and fix it if you want. + """ + metadata = {} + zipin = zipfile.ZipFile(self.filename) + for item in zipin.namelist(): + if item == 'meta.xml': + content = zipin.read(item).decode('utf-8') + for (key, value) in re.findall(r"<((?:meta|dc).+?)>(.+)</\1>", content, re.I): + metadata[key] = value + if not metadata: # better safe than sorry + metadata[item] = 'harmful content' + zipin.close() + return metadata + + def remove_all(self): + zin = zipfile.ZipFile(self.filename, 'r') + zout = zipfile.ZipFile(self.output_filename, 'w') + temp_folder = tempfile.mkdtemp() + + for item in zin.infolist(): + if item.filename[-1] == '/': + continue # `is_dir` is added in Python3.6 + elif item.filename == 'meta.xml': + continue # don't keep metadata files + + zin.extract(member=item, path=temp_folder) + tmp_parser = parser_factory.get_parser(os.path.join(temp_folder, item.filename)) + if tmp_parser is None: + print("%s isn't supported" % item.filename) + continue + tmp_parser.remove_all() + zout.write(tmp_parser.output_filename, item.filename) + shutil.rmtree(temp_folder) + zout.close() + zin.close() + return True diff --git a/tests/data/dirty.odt b/tests/data/dirty.odt new file mode 100644 index 0000000000000000000000000000000000000000..926ebff39ef53a4d3cc5caeccb5118383482e8a4 GIT binary patch literal 14114 zcmeHubzD?i*FPX2-2xJl(#_D_AzdN}2*UuQ0|Ud*f;5+wk`g4OyQBo9MH*DPL_$F6 zM))1{diCDN`+VN_^ZWPRaAxN0v)9^j*7~ls*I7sB))jOz6cjAr|JsSPd6xhlD+&tA z<pETo*hB3hNDoJdrK2MZ3bsT-;SRj64%R$yOJ}Gv58M&rU=0Vm*h3tUJV=N;Qs-~X zf@B*VT!>LnE)Q;inGO_;bU{FzdHMN%ErJ5y&jk7Tl@x>o`9X?2jt;h=I$A3DI8-<Q z9ln~Xq8{)q037^SXuu~$kG(4Jh3%+%#~B5MWD_`&P)+faC{a*|($o~?42;`r?mW=Z zzdbf9Cg#VCAja@xWBlWuVm${%c<22bws-e2phb9LS}y7Z<Z0q>$mAqAaQ5{eI$X1C zS$G2En`~$tR(t_}V4`DZ_{m}-0*KngSWizYPSW)@9()$r{X&=QJmn?hRZ{b8znGSw zi;^LbsEyKDlqQ0|o$oFWMX4AK!HG6lhA*`rUG9~Tyxm8U?8!M-zs<RA9EOfo%BpNZ zy!Hi0EdSNb&9b7R_0?5btYMJmXmvuuHTl<*laq6Ek3MvS4u!Wbyo~f=rmvd7t7=Q? z@9)P9XuZeBhbL&$mxN8qqLC(^Fu0Q1Zxcu)N3@|s<BG-?YhKoZ{1mH|hl`E9y|WXm zo`ctp71XvqRwf}TN`#&{wk{<lm7SH<6-C?2*EfahO--XYdhOaZZf<TZEiE!7&2&kR zz9fE=5^{#O6BAHNOJZVTO9%u5<=nVr0%pT+XlR%a9}k<Z8}8|WPQ9=P=<xCJtt#B7 z>zqb&HNB2^AV?%Ax;qsFjnyjvQAJ_~FtD>@D^p_NMi^LF%#Ds_>lGY5^Z|ZW*$z@s zQ>Q(BYQT}udhd_$@ZFsqgo{g)<AOXAS#8X9PI{eMqqDP9uK?K{MH>_p<l*5_Qc}{| z+L|hCk8pIfb8sNX2@+2mn4L`@T$!4f;Ky@5-dWO?KDKW4#~AXCPn%z84<W?G#jS<U zIe@5ys3OhFL@4pZ)1rEp-Yo5SpB`YN`;)S29i5)8EH8I;b)B4^wtK3sTIlQRo0+90 zBqTg}@+3W7E7n?5Gjamf?6Us;ZGpb`!-rjw)GDJ$FRw=BY<@~SW;GTT7Oiy2>tGF{ z4mP-a@x2I{qxmaWuZB!cn)uV#-7S@+-`d)m-*&gRx5o^SV~JY=dh6)mGJUb}p)*|O z{3shJs>}kZ{a9d0Ny(>ApK9nDC@W*iv(eMvyLWGXyaJr3{VW$sC5(Gxc>X<1m>Mtm z{M;ujYo1&hsFmhR)=*GT*yzq~AjHQ9c+WO@iLkO>WvKS~>REp`JtYNamWfzi1U`#L zK)|9_Sy@TKKz9B5@z#QZic0tDs@-@nx}5I=yOiZ^cdE!sYG8#1SK^qVASw?@b9eWn zj!;teun*hr0?8YnM)Tgke~&JQhmS8NCf1)U7{aXfeg?07%l#!jweZKKCCy|3C3>Q} zEk0j?wc_F7iKdewARthtTnxBMD1fK0r{_<UBIV_3YC6>#aFxG{Oeyj%C#3>Mf{l$$ zv(MK_m~e~dzB*$J2n3RRbS}xty0YA#O1T(L!J97TteiL6<i4HN+-#mVT2oyO^tQjZ z=X>*&jpOj}u&$1d<KC)n;L!GBx0jcfcDiIJKa0481Ry)NhyrVBgo+n|sqHTJUtzQq z5u^48gx<8)AwfOI&(AMCJzY*tPFh-edwcu*{G3dw#JGZyfnlLDoPt`!VQ+VLcYhxr z4=-Llr=p_b=FOWN31ReDPjV=OlO;VWo0_DEnB|BTyQ4$)vl|*3IB%EUi(q18B$B^M zKqKmPP^_Gd$`n}*h?JSKa@VU@vIYiZx4v-kVxjwIWif^-BDOztc64-n{AjJMt(~cZ z1;1T-w?Kz2icUsaRP=QeUc0pC-s;-BBKb_mu88ZDi@Pg>hkJWO%Goi4D@=iBXJ<t6 z_iIvmmnz#UTw{BefZ<_dV*`y^Tt8DqcP}F90YIXh9vK<21C?5|%yxD{-W3{JRfq}+ z?XQ=$xFQh#wSbnTiogj733<7=(2c0;(+36y06hb4c7QV`CMMQFjeMpo*8bt4LedqM zs)@cO1hB%+2t@POgQ=sVBUbHnEOfN6EV~pN%3w>%stK4tGLbx>Jb+kYWva!o01$C} zazcfN>3%EL+R14<f%EqC%nW>{p>qjgi4*u3SfNHhINM0$dK9;g&;0yQ1BddS`GgTs zQBmm?7{F&=%3<k}DG>#>-WzXubODIO=*S2#nSmulq%xBn5r915VPPh9j+u#xw~<IB zplCoZfW#5W_del{>s`VLvKma6!X@Vdw2;q9T|Epy0;)(&O-*Jsc^;#pcYJAOWxVpQ z?Le{2Y5@Em93C<SKG$QH&wu$+lAXOEBjeu6KpHS1j)b+Xg?0dc#1S7=U0q!P?F_rd z`7|x<%9SesFo@D<J$O(Q85tQ6aOKD(Z!}HZMV*Vh*Saz=5S=HHlO*(gsTnmXsq;cx zkXfTEaM!1sy&G$460-8|RM`M{a|bBK#=fUS^6l;I!otGOtE!wWEa>Ff07a6JXaq`O zA%}p#4lssGaYND8)a2&kl5}3yxO<m|`cuorxfc}rsl>E)ZEbB{&BP?7vC#_#+enje zV<zc|y?N{HeHSk;iNszZO1$;=CE*bfc(}O0!oBI~;ZN4s-`~G+;|8#Fjc%LDSODHK z1ma-^WM?zAQPPPa6j|a53^?fM=#nz3tE+QzlnM-NAP_ML2?;T=o+ZS1ta3In_-R~R z6}X!~My4exi87)>Jx8zT9RS(M$;m8nYCMT_lI}VsMvV;(J{MmPX=!O~2h%Yc3JVG< zs;d>TloS=;j*ZcYBVW(W(J@%t+UDu76&4l_q)CMEE0^yI3JUUfS@;~^l*iHlW=BO! zi_cb6SV*7z>WL44`hb2|wD@=*ZCU2%9q)Y_1)2bwvAk?kV&t{IW+<z3`?f6{KKJ(R z+o>t9g*FUTQOd==J>)(12M-=RcEid%I&u$aef;D}u3iCO7naSN*RLO|GUEg>3Ja4n zpi9t^be`<5Gr-Y8`0;iD++5Wd1u5yO=Tz3#))p7T=9;}J7r&k!awe!pI6)u~Jv}`Y zm8+>J!6701U4YdF1n21m$WON1^|MKYlbl>@&aK4r4E-dHyixn97n(VGO`iK4H*Z1! zlP{n7m>%m!^w`i4ik-3^p!0x-aFqxECvtM~7cX95U|>)#0^bJ*t`Sb!+Jjk2;ehZF z%YXXxX@~+rpi`TgI`z_H7r>o+!l`spp4D!f=B|t#xtE~LZTbRGjlILext2#VfENIm z8gcQIYZycGi<OlEB}TaAIJme9^71d6nq0(Zwzl}%`N))7ZIzUi9!krIi#O!vVq*Fk zO8J}_(-S?Fd1PMnP5{sO^x(6U*8wFSrZB3b+viD)PtdcYZL5l-)#2>>HR2!;5)eD8 zNE-mSP*G9o`+e7C$qWn)`_m*S7|2LSvWD$|yCi*nv<RqtT+he(c@*MM{)2-9VXDZc zrY6Ri*WKMAS!}V){WewM;ka@{h<X6XDxW`Bq$h%nm!rODv*RYj_RIYQpqI4APGI+< z8gnd@N>p8aeMKeZ>w=drUjkE;lA;9>rm{9PHbPTeot!Ld0B;G9O1Cf5*J)_*0I<Zz zhb&!*?!H_(K#sniopEt-DPyZSn3zya%Pz<fbpQgjGWd*+jEp(1hmnbCLaDrS)NRgN z1UmA3Ys;|?B4>K%&cyimD<Mf+%C6bj^ZoTP09ydS2cQ6eQs9R98t09E+V>nP$_`_T z)$HLCNXeC(hSrz}+bS^cxWAlfbmKLtemyfoOGej48HPsb<mgxq*tzUsjdaP0sVQn0 zIWe)*#&`vwvJ@1F55`IXbL0+KxnzOP?rv5wv5EeE`-6>%#}DtcI|JCL%m|jMaoe1& zcU~EIDmH2@fU5tP*<*Exp~<|(c|}Y09RMyc&DI6SJx_0C<zT?YLLy;yc5J-7hg(~| z(`YC9%p|SC8yg!2Y-9lD6_qvb93I-<8fb|}Fc}J$`<j={RYOUM#9_|>wEW|bKRP-T zbxO#TD&YM#g0GK9M)WF-?;0CpO%IQc*FCy82OK-fv*B!&L>{v@gM)+p{gBY$G$(_4 z*(i?ZqBH}5O;lHZ^Cp4w_~CgZwaA;^-URifYUrrHzrSp&RfJkBIw_iZ-HV!<YoUq^ z3=Duv3IG)#c~i~ar$BoECED8B`~w02;k8Q{c=HC$UVao5((Xq|i4j0!I32@$3>Z2S z6B9kMgr`sO67+R-V}Sdw$k+o6q!Xm5aOci-wOCvV9#Cq(^Ku_M2glmxW^Hx#-D<nx zP*PSvo5^|2@w%?W@!?%@`Cy@bTFZez#1|0BGsfftkX%#avfP&(8X5{HVYwSR8mij) zv6L?^K7P%M7o`OS#G#7vnL2D_lM@p&O<sxgSfPp|v$J28mV|8jD9bscyCu9&K=A}1 z%@(u0b8lE=1YgD{K-kL6o2b%@-mL*pA?<Ylv*z)i13VJ|05Jr_+xZ?_M<_48dWDjz z#1{5uU|_bvg>yOzCs#L;5`);s(oWbmK)kA|%CzRbp^J-9o!9gAlHy`T#uyEj08^qP z07_o*f;4aCjKG_Za{+ho`I|=)>c^wk;1a}bL}vhi2}w%g9TCX`tDKXQ0|3}7<6;s1 zuHHo6#NMT}v^2>ZgoUW(;_`oVFZQPht$*x^11|Bhvs-$6@wTj(HzrT!vqV$U1vU}) z01uIshsU8k7$49E-(Vp@IU>M?0vsbsDk`gW#>c~eo_zlN*|X^e_^zob?i5B-Rm`S) zH(vx`f0~-S!C){TmCVWG<HoVffb^t1d&Wy1zQDkI>g$V=rd12+B^MD9F*P-f>v=wG z2becEH#azZ^AZkDH>v;|>-Tx?v*O;If7Hz$Koua?*0O!_)xttTfX5?;6%}Do_kMl< zaC5HZ;V}Z(Uf9~Q2C{dFa&i^{n=-n9uY?!Pi?n-Vb|Ao7UeO8(HRLc<RP2Dae8uF5 zPjmn1gkLns*A);DfY*MuH>&N3*DC!;W$`7`-aRayM6aEXYO&0uBqY8tQJN^_Y&9({ zX~0keaO&<ZO4A@uO-1FoHQzd#r=6OTLWUEhoV^VgBm`n-f8VUirho1R?Kv)Mlz6~+ zgUfo+?K0yov?_?L?Uf>6Q&Op6!$*m5cDC(LNr;IdrlV`I>8Co_T7i$wwYV~-^6+&5 zOIlD+z$N8BCVjTmo|Tm~D03mn$EQ<ZP+6GR-?6l`By({lNS(Mz7%qG9R(7Z!u*ut< z6vxNMfa6ijcnu3(2QY6VfXToLk`K%B^@65+Sz9|<&5k}fIjJ_`R*Nm(Jf8@7)N7h8 z8X-k{Exa=(CI;}Ri&>fxOE_{wz_t+ZrU4V5LF#8zZDQ}@;sSx>Xys8$xN@D;;<u<! z*I(3_aFd1FR9#<_{-YegDNJ<q*VY30aA0!^>~1TZ7L|kKTMxYH4x4Vnnj&Ea5Ws}J zaNqvW*QX&GqOPva!NU`bfg2=$@W=dD)UOi8OoT}bXs?BNFAL!~u*JD{4MPPT%@4(T z5O4&6jR0WOE;>veg%hr#Oz*W5%5-#gwkA^-b<sDBm6(o=jfw0?9zH^QKqi``!4g+f zQ-g8!szPwOxJzMUql7#F&1Q@-5%G4LqoSMSl$*1hOc#v-o@ymVD@#k40F8%fVPO#@ zZ~5Q>H95JBz5Undi=D4QsONL#v**zc&(A!&=WI4d&k1GUsR)_2mhAx13m;DdW2By? z8@(d}ZUcou=r02;^iW467zM@ssZ>NOjxQgc!sgd7Z+X*3n1t8Pa;uCZ0lM7w1)(br zi;lcIzUHn#@VfMr4~)!QDTg7|q~gu;d5)Tq5&MmPt3Is^k@2KX4+vJ_9G+Sjj!u|y zLR@BM!G-JD)EZOY9*RCH4V#_)10P@{C~8Vtily@R{J+O<u&}Vc1%ocH{3y7<4=~&T zc^T<(w}+M7Hib_Klljb*5rJUizCD#zdG^)g-o|4PSv1(K5Qk_yHMI`+VbwQ@=O?tW zu^Ozdg^axcG&g)!wzecFV#nZa{;br6`1*(h@+yxMozQERmF78*eB7Tksm+zPG#D9n z@^Oc@rDcrR7^~s7VB6i&51ufbgx=w>PGY}F@;074u`mZQ!~sTr8p9z0A3m&XxY|AW z<jaNuBt&XNiE#Ax!Mt&*U3FUFM!D?$V)KH6Hxlht>Zt2kN>8MAth$rWY!igvzksB} zYo{h$`SlwH!U2_^9*iPbBp!23@gi##TYAfFvtwoS@rsd7c<#wcdBv?tQbGgmT+BOr z%!M}k+5>-RdIwWY(@TkoNFIs4#1)WwiA%DMW1T9<?-4iG`ut*g<_1D8yx@V@2C<=W zmFtf174i4C)gw#AgLE=xXTpykCWkl$&PwCX-M1--nm@WDs!S6d!RZtRdUZ{VUZBAV zD$df()b>=j?sfYfmhi{oPc2^@Hk}kO#C<1LTCc}-F?!rnZlr%Y^9fQ4EvlvsEyxwo z_~U)&swo&l#^Q^EtMB`i$8kE2wlNBy6c!-V%C#1B`|T^c3mk_T{7%=G6cQrFeI$@$ zdi~Q{L~fyvv9COrL*&1RsE+o^B5#B)-<4<6lX`KKFIvuz>lGG2_~@epkFyFIsL3a1 zl?}a1$}i`gDwQn-RYdBVy<B`>p9-&Ysb_X8&YrHSOuIe7xtX0_|GmehA^ISLj2;y^ zj6A3jgNC3yzsUOf5Wb2NNmo>mp=S}5O&m1~Hs=Vs=bSHT#OCc!wfTn2-B@Jk(DUm> zcvLUuED(guw^z7j&<>X4m=?7diVDzC$XblhdYBY@xq=Vs<Wi3Bo}jJJlf^x}GL1gg z_vHF6;~0VBBRRfw7bu+T8t?Weri2zZ)${&XiQpP_hgFX`t$P<Dbi2-<W_opP6N=cM zdPsn0PdA67ILso#_rHu$)!?@W)G1BQ-1<t7tKNbhsCKonN&UTiZN{X)eb1#x>Q=G% z)~*>Vb!(%VPm6anGm~HB=eBEPgHE5{l?Vfgm26<kmKDbwTA^U&*7%3V8jutl6?QzW z(cveX+FaH+e@x@utWD|`$>qVCgk&A$NP9HeI~{qqU5JAOs-C4=!k#dE<mXo56B+1x zYR(_cg~-WljN-14pC0mw#hyn!xzX15MLQo`DIb?)hB|}ixdPdY0zy*L-{*r|%vVuI zO;7L7@Xg@!sa8^mMs8kPM>v)sx5}x83wh})vJaI`488%a+95Q@3A+R1LxNAEc?AwE zRnHRd_&u_KeKd2occ0X*TvjlIqJ-pk1}zr#-)gTbqt@TMCA%zVW$QXOl6k|bKF+!w zH$x_CUe%({ocBpt5}tpPj3q^4uUt8#V<u^(YU%8q6_IfKyGZ<zr3bA04&MI7%*Ykt znaz$o@P|2ZNH+W35+CrmYf4+H&5a9go|Cz(qvO0@rHY3l5B&1Xg<w5=!zZ{=iOF3v zZtSoSJVx}~7>wt!wK&9sJAOmbNPGI5EC)AA!_lF!rEs<tj^lg~tGDl@f?A;UVDuWj z%r2V6#Z{eKSFu>zU-)OEqoA}A0@3F`W7FTUZB9y(e;2?sG9a9c2YlOGIzVk8&PX1F zwM|Y;g+l<a`EhG*L)e`8-wDI8wu`U7-|`AWLFX0WadPaLw^`Uo7J=cr;e&Vehp&$p z#P>G(Mc0PNC5gQTB1+1?z)ZJ#M7FQU#Or=pQ<kFB=6cRu;K*?_ajJ{jx68F`knw<_ zrxNdg<avYwV{YpH36<&Owty8^ZG1kH(dfNm3%P|gg2D<$zZCH>Cy*<H2^uV=RNyfd zcEC1O*R96I4REwiXAi|gzp|sMT-MKLI~#ZlA35~5a7qZ%7?Fr0A<gsm)F(JATi!?a z8Q(3k?!Db#=3%<v2ym%NIOZ2Z=Bz=Is|(aG%xd=gEWdsb@B|j?@+rXYfd4W8?~L?- zL7Xpff5ybLY^sjfXS0m2+HTJ7&O!aS!&7hLjDEBpa{EfQlj@9{lU7WjXllo?7A=C$ z*@j<y%>Kyi<9Nd#Jm4t0+aIK$2ltWwIAPcQ{6m&8#;`j{gfVe=AjOp7)@k0O#~xd| zQ!OuvQ}LrG1c;pys^m#?qOa>SCEd->xmoJ3PJT=7+U;KdWeskZmt;3Nr=RBxHi2$U z5HrD6k?k9U?>90+w3KJ#>XA|pA;7s7^9*5H#7o}-8E>XwE$RFS7<#CZxnBQZwW#&W zqn9{(CA|>JFNdewKEmV(YUS`wM$lX3g}Mb!&gAQ*%mjD39Qm&)FR~$Dij5-keO|8@ z85kU=^t2A?X!5=+be(D?u9!NICCxJ+h{A7wxHfx7V7jTQ<5{YH+QE_a4Ih-O<IVO{ zVRCC}B=<)tKaM9aX<)fa2P?P?REAFenmXBEkOCf9pW)G7j}p9j>upBB(Pfq0pm6FA zrCAKBEhPp=q8IIaF4H8DUlPtJ)s1ipsFj4<amg|S?MFD@dAxsSuIf^3n#`P5oH4)^ zXb>RFDlu>%llJ!6`Vp^u``f^3J;9lpuDpvtRVL#P^r7n<>xQo7ui5I}#**U%V3A;| z)o{tmMdFL2*YL@gkmci!65W4Hpj4Us##%6#uGO@b9fl6WqNWmzEf>{?&y8C<1gji~ z$T#E6hR3fDOPDiD;wV3Eh~izDaJqID`9%Kf47l=&+>yTl!G2eg;K^&|xxHJ`;Ns3` zt-wX}N(cs@?1p;G=!d}UkK(}_v${Qz51Qr0WPKhxyb_TWs##HQQm-LjEpzNLOo&pi zIujEAnt1J^)NWfw`s5XuU3t^E#ew$aoMv}9@5^<kDKs%tk{6kVMk2~uh>~@q5rHMA zdH40X=kbs2^qlRUmrEHJ#<N&y%Qjo3&=|E^(<tr!p|IyN6rG5~m!R~+e@1LLLOF>a zi__EMiwdfm9&Q)<LVKUoZA(6qhU?J3_SSuT9n9s8daY3V55gdQ^M~;UdI1U1@i!S{ zG84$_u+Z_flgf)hoce3ze4fQOvxn%}@dw7D{2R$Q62sARZDUvW)nsMs1gELWC-zI6 z)WvBtMFR$v3izx}-Z_lxZsI=R=tK!dOM5^bt;CYyJyPl4&_Zm))Y;;jJ6!$NhAX@C zMV2Y0RnYZ6lnm{#b+=EuyB_9Ra<V*2=89M|%CNophFNEa{R>gtHh5xwu|r}l@vZKf z3(Z2}9M4TFktY=aqX!yR3Yy3DeZhO{a5>ynH3xjPx~?m&?{v_HM!?CHZDeM#!_27F z5j;g1MDvGFMb-!->MA*`%<lMk(FY{l6Un$hs5a;@Vuws~ImYxHS@L_sNS8tPZ^>5m zKY?R$55m1qSED1O?~VrPy1l!IiMekCNBe?gJtN-U2%&a{e3Znz*~`w0m9BJ`UW+&B zje3qPSD7mLl)2|V7ubh#LVwdlwGTf+B7c!c@jCZ4<^2Z~EP5M*<RLLtTwnW))vCt@ z(mqs>RBh!1eZ{bcJqmoPp0D9QYqjt+Q8!0c>UExQBhOM37_`09aGM4XB*OCCGyzS2 z%=QTV$|DWdN5x8r4ZZc8$|w5RLRaZI`*TnCtnaJdGR7_0E*843y)39A$d)=&$JfZ; zf3TNt&cP@3t{v21+qsyjnLf7kC@-T-6GAqEIgHynx<te_3Q>+u?^TvQ7DC<bZ}w^J z5BTg&XgX9&iMZ(~NG3-AL`zbNcI}g0oqfv?33yN91Un*Tk27xM*2~%Fps|DlfqqbC z;e;?=XEn)}Qu~o-`BGkWwC&f2`ej)OS>s!M$}`@+jq16527<Zm8$nn1#RF)+a#nAK zY_8fkd^B^}b=w)|9$k2L04f~tqjJ12BrKPQlP8xIeN#e-B5EU~fq7w0<<M=W#|r+G zMlM7cd4K+~QDq>g?JYziO7fmZrqWI72=1OV5jFPm;IJKk`ijtctiS_}2ew^A<nA#Z za#E>2e${?T29xR3yeJomIq0?#*QY9qZE9QnugoI}X|2jW8_6wRD=yrB>)d_BoLL8@ zWhm`edr}|3Yj)-O$bHj!e7nSP!6zm!u20L-u^r48><_<pvzteQZ!aq@zVH4ZTvU`E z>fxX@UgTT7=pxiNt}JmtzM$poHyQF#HX_e0cw2$6i*SZgW*TjABF*w_JYX9v(iL6% z>S$qr&u6XmsDOGg&-C7zyDg>6;k~oz$n9NjKA#4`h!rWy!AAb{Ikd$UiFWtlfbA^m z#ZGR%^to$Yx~y%w6wSljLuS(y4U0D(cF$l%!~LB=^@5M{r0BI9<<gr;KI+FtKV$&u zy9dr}*&AA<3Zv^;h`|Dfjv9Nhb?uA1U(XB|w^3w!4rv;sZ!b5!eg+<#I?4##nYQJ# z+jPXg6SK$uy73zF196N|Kq<s?Q+I>3u71i~(qj{Bw)IxegmS1xD3YQAG}mlty3%w^ z4gIVSJkyyU6X173cf+X0a2sW+JNy`Uk=d7>5%6h4Z$RfjvQ@itxxc<OA~yoA4`N&s zDt$sQ4`pP&%CxrWu~o#XR!_jgOZ#kP&{pNu!N_*(`|5`jWYC%iYG3r0-()vLs>F^e zfN5v1*3bpK#S2T&dKzRSkgky~MKit2c1O^+U4u`fs`F()sKXX<Wn2i$MOpldFST-( z-8Y}Zn-2E2%RZ;3dx(97TAgE@dp$}FNHb&4K0?^|1U6U9&gWzZlktMfSUwsE38cJR zlZ!|0Z}*N(gx}%C80ZL@u|R40N_y#Zg~{Qq&;!W>a$G;1uJ73eGdl_dT!1@vd3?(* zfFLa|y$VSg;Dg>3f^Y`%3#1tNdH5LVAr4@;HPpdYiqX(OnOl@mMjA&FZes%lL&Sfi z9B>1<36}H#se`ll_e)ZYE(izlZ^;Sb4wm*1XK^H0{MRf6@n2huf9vRbjXMnL@IZ>u z4vBOW=j8>$NIY(WJaB|9FF){{llR+I#y`8V2LIL0(FFnf*2)^p3xPo{^B{oi1|ELi zA8jv3_&*)gW!qne1c$@_>gaL~->3MkJ0N2M#QXjHM^Cm0YirnF?SGj7&{mMwme<-6 zY02#hg}5;>(*L=Rzbv=_BmFl9;z)$0gR>1BVgD`H<1c=|k^|`;mq{PomqYnBi0@p0 z=?d_=JHwE?aBCX@z<cHaP8g+s+^fq;NlSj48q@)bgj&M5!3c;Y5{{6*?+S-HLmlWP zd4GHPJJGksfK+e;<A+EK@bU9=^NDZ^2pjMTiVKK{gGB$N{(Cz~Yq0o#5P-x%eB%7V zlDt1IUbgzr!TeJBeFzY1VB8M2+}19LZ;K+WV<5z@|7XAdaq;goU=J|hIZF%tqxGNF z-`m<k93TK^U_~^bRtSjpHzDU0<Pl^xbbz|k|59lIM2hZ1z(`&}K|XtaF=lxeD9oB$ zfbR$UpPasP{!3Auk-*5Ekx(%Gx1~l}TEQUPV7Q9|Qi>7KweRb1Z)y8iCI6q5aI4Gy z{@K*=H!U445tg<HOGmpOjeoFkgCnee)B$4ltuL@0&>D<{Abwon=V#=Vmi$ws#V_0Z zJo{@F->QFX<Gv$`IWI#4F@PqQ2ax=OVqoWDZ{=VKg*o#gfBgKHY@#j{6jYRN*+jN7 z=@cj^SH>?BhuoijeqZq*&5*`-A$7{`N&%H>Y8X3#IOxN?O$IF&hAAh3d<f4us=D0t z+n!-FiZq;a)cz#bVRo|cIDcei4lQ$voyOx})y@6)!`nOlHIjo{X$wymXxxvt`=h!S zQIR$m>$qEWF=YH|#Bu+Vj|ylU)x~*#-tA?cw7V_mXWg4pZi`j>{o5ZV4tZ(l2rvnK znooL8dfd{RL`<45Mq_Z{O`?ZeX9r`?<{lpJtd;P#n0g#ew}T~fb2aZTZwc;Rt=)w6 z(}<maPAGdX9Bez#H+#3y)vmhSbNk(Bo^YAN#_Y_j$JddG&9PE@+Kr2y#k7*vvl(6= z+RwZt&k5&s1^45{#~WqbJ_LbfrYC2oy~^ET<yE`gqv5#Pv?&|-(ak#}Uo1G`UmR%6 zJgwY7lmmE}A&FXekER|Wd~7xc1BO4U9%J5M*Virx8k0V`S6OPF{&3*J%kB6xVIpg< z%<k$3>e0EV3x<}J4rV0{z5o08zcu*(k%MzgyLl^WK@(5~fDiwObORBOzarh2d4-qd zGWAgb?qCD8{S)t<1$ZsNV8Fb818H8c3j*<H6#U9xL8b3#^o{<{51QYw0pQyf$bWRU zvPAIy9mjs9yu`784!{Nm2khT}qoVo^l_T8I#Sw5yT>eXMzoSFK;V?%_2MFxH(EV<d zmtFnu=XTlKf2X4R&D@}u=E|8@9*VTLbaehN1O43uFYTerFSp@$fv?M`pfixo2$=qF zp|2*BDa52Ik@sAg<0skx-k!;LP4)T5L|_HfC{7RR=a@FvHH&PNG0%Oxn771zX&6DL zA1irgq`bxQ8aw;*Ak13kve2lV_Yx8%9<20N@amt=dTM98F1<N@d$f+pN_s<9s)IgM zHYE%byh`jXUv|rE&|%IqG}gsBWxc_gc(nn0jWS^RrO~5Pq?F+kvpYCEE48zINp<EW zJmle{ot^B3(EW{*lZKSp8A+;)r(2&g3oG;H3gtU)D;)&&B^gY3l(^pMKR?uZuiWg^ zUq3c}Wa)g?)o&tqkk^K<_+Er{z4v{!X|%z63sElfrsdCGn`3Uxqr1&$!foSI)BHtA zU$El!%XAwHzJ!V1g;B6{1x?%DNSQEiU7=X1iSEX}z<DS>1wWX07t*_KH4-9t7mxlq zO~Ctz9`;pke6S*_h9`bD30rP9Qz{Mx)eXKt$xoy#d#xWhiNiVX2A2mqj@>yW<?;?u zS12KH_F$bBFr@A*sj}yJMnMR1_~M;!b}b_FTIy|fiW(D3un-+CTZ@riMb2$k4wuKK zj4x1lqhn_j!F+E@T3W)fFG49JRo}W+_Uj~PZ{_zIB_)xEF~?2`nLXKK`S7wjYj*!u zDhB2|pQ*w77$uvl3|Lrrf%ZxiI7!VbA50y48k@R`z7i6N+!)B62vX!%m_B2zUc>bZ zmu^iu4qhaWL;YiCSUO?OmOmfJKfz*|DBTm-X(y~Y+Gim}$nrpGGZ~~plv<>n+rv(7 z#P12&Rz7?EiWw?uMr)1oA~C>Tk5lvf4`M~}Ej^3O%=`A)lfknB8k>3#zLJIXZLDhw zt2zhXHc?ITZ40$fmCSEi>I6M}2}AU}eN%LUZOxf1H6dF=`x+j0HagLly8L%WS!Au7 zw1R!<%|W^y)>>EGd{x+BDIpok<)Fc^PN@m%i%}JdC#OzaM%q;IHBAr4NHq?Ut)MOh zU~=QD@uh*7S0QH|_l(pDG7ucPa;cp5`Bs?<8XTG`(k#Q2ZzWVXc(XU=7+5&QQ9}db z3d6Z*P;hm_5~$^;qpIP-!HoJol9M!ws{->#94QpJh$Hzpd2;w|D8%Jh0EWY|lH%TF zV;mkzPppMKkw~Ih$neY)=S3Zq0MF5ha6W5ay&qn+wCHg9&|Sz{^UA^li*ghjHY^9) zanF!wyXKpdTRm?ujbxk!z3vHEW)=AM)8;et)y}+mv_ae`mml?|lK+p!{WpXCYMzXD z-KYp<EAY={c4h_1;%I{&Sb2wKBVI5Sgyd%4)-62`)`;G2xT|-LzC}vrV?Ki3CNC0J zDW{nkRnY1x>^2@5k=;CUKpVm0p~OzgIpk_+cEWR0s*{3aaYAC^kzxSLkpam)Lj10` zmOF=RE3XO8I0-{lxH;<4CQ*mD92$w{C?WOn;?=?pZMk?5?!rdSu6+S8jwX3u9!^|Z zLDo~2G#uJ^>klID%<ro3^1F7=1yK;mz)o|`TnEdBGwVZNLE^?8;_)YN9}<h&l-;%K z_O#{9#J3_)Hlwg+bKh9MZM_STUBy_a%2F{jEgs29Bqdj7FfL{=L$8*_GVLBXAb*il zJI85XK4OR%=zem2SS2WfR>_JqKWSk9uw<Vjv}3w`0R13DBCc2naYw!&+l~!;nOrWd z_(o-@6Mj0AMmHw4$8F9GiB9rQxedr|_S0%(7zu_EZfn#RweTr|Gi9Hh(N0I<M3SL9 zB3kz084{JfHMq@H^oLTXRC96Mev_R&l}@vzpH+xDvxe-Row{T<?5_LWpWWo3x<I=O z3eZ8$<i#;iP?8yb1O<MFxz_;9)l@Q&<5tsB=KbrD46sA5c{1nFMo-qYc}Rv|8{WD0 zge&r$wL{}<kQd7Hyo5kBzkuOdI-&DV!Xmesn%Ez=Z=T|;kg-&0GQ|lQvsQDcJ|R#; z3tQLAZ=k0OuWcnw*4$*l*Uv%=InIm;^ms$th_P+7lp?l(=%jKK+qp}vle6568gx8% z^wKb8_9m<2O=dj#j2RYIel`s1`S-GNN&I?_!L6)jMFKO>9J5d|%Ycud#$Chg0mr2r z8p_}ze;s)*4<h^`KPtm#-A%Yrv;Nr=<);xUr*2Q02I#sc+HXxVo~T*qY^XiB9TrS0 zroCih*_duGE92l62fH(K!Qn)COX00%lf<;T<89*$>M4{;&I}3?n~I%&J$?q-PK8~Y z`eDDhK#{RRlA7Aa85gEEqF1pqi@29&C)WjXTa3t1!7j9Guet9q`>Ed#ChBbxo(;6f zhz?LG$+5(-rg`{4a<ZXzHoMhr>GNvUtKei+$cwQ*+FV}`S;Xzm5rS6?O-~Cb*f;N= z4R+ZRqR8SUCN2{5Wy*Zv8P&eLn-_go!oyKfP!517j?OJqG%}Qbe}x3NrN5LPFO>X~ z>Ywk5{3Ou?e*UlC9Qncehk*V15b!dM;b*}o`CC%M55gZ_;Xj`d{v`ZF{`V(_KPZ26 z`D+O5@&Ve<f(iIdKOd+4p!@Oq%|8QVKk3ZrfgXWv@nyK|pH%<c1plNeXa4ss@lP(l z_kn+MIb{1!hxA`O;-5@^?-BoGD$V|%O#kj5|6QzpQhwj*|2%j94X2;}^1pMrbPRu% z(%*3T)qnn<@>%{3pI`myf9G@QQU5GAIR0~ee)`w{Q!WO-;quep{&y~y9{0}@_8Tt$ z;eY=-v%fEN_ivc}>W_ct^Fxz<wHucn`Ok6z@cGq0{~>xmD1XJMOS|&3IP?9BmC?C{ UfqB`C1o-O(B1&?XZ=9k0KS>F)Q~&?~ literal 0 HcmV?d00001 diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 717de3f..743a845 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -4,7 +4,7 @@ import unittest import shutil import os -from src import pdf, png, jpg, audio, office +from src import pdf, png, jpg, audio, office, libreoffice class TestGetMeta(unittest.TestCase): def test_pdf(self): @@ -46,6 +46,14 @@ class TestGetMeta(unittest.TestCase): self.assertEqual(meta['dc:creator'], 'julien voisin') self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1') + def test_libreoffice(self): + p = libreoffice.LibreOfficeParser('./tests/data/dirty.odt') + meta = p.get_meta() + self.assertEqual(meta['meta:initial-creator'], 'jvoisin ') + self.assertEqual(meta['meta:creation-date'], '2011-07-26T03:27:48') + self.assertEqual(meta['meta:generator'], 'LibreOffice/3.3$Unix LibreOffice_project/330m19$Build-202') + + class TestCleaning(unittest.TestCase): def test_pdf(self): @@ -153,3 +161,19 @@ class TestCleaning(unittest.TestCase): self.assertEqual(p.get_meta(), {}) os.remove('./tests/data/clean.docx') + + + def test_libreoffice(self): + shutil.copy('./tests/data/dirty.odt', './tests/data/clean.odt') + p = libreoffice.LibreOfficeParser('./tests/data/clean.odt') + + meta = p.get_meta() + self.assertIsNotNone(meta) + + ret = p.remove_all() + self.assertTrue(ret) + + p = libreoffice.LibreOfficeParser('./tests/data/clean.odt.cleaned') + self.assertEqual(p.get_meta(), {}) + + os.remove('./tests/data/clean.odt') -- GitLab