From bee56a57ce0f45c51386423d508ba06836be366e Mon Sep 17 00:00:00 2001 From: jvoisin <julien.voisin@dustri.org> Date: Sun, 1 Jul 2018 23:11:10 +0200 Subject: [PATCH] Remove docx revisions --- libmat2/office.py | 81 +++++++++++++++++++++++++++++++-------- tests/data/revision.docx | Bin 0 -> 4701 bytes tests/test_libmat2.py | 21 ++++++++++ 3 files changed, 86 insertions(+), 16 deletions(-) create mode 100644 tests/data/revision.docx diff --git a/libmat2/office.py b/libmat2/office.py index 5381eb9..acd8ca2 100644 --- a/libmat2/office.py +++ b/libmat2/office.py @@ -14,6 +14,24 @@ from . import abstract, parser_factory assert Set assert Pattern +def _parse_xml(full_path: str): + """ This function parse XML with namespace support. """ + def parse_map(f): # etree support for ns is a bit rough + ns_map = dict() + for event, (k, v) in ET.iterparse(f, ("start-ns", )): + if event == "start-ns": + ns_map[k] = v + return ns_map + + ns = parse_map(full_path) + + # Register the namespaces + for k,v in ns.items(): + ET.register_namespace(k, v) + + return ET.parse(full_path), ns + + class ArchiveBasedAbstractParser(abstract.AbstractParser): # Those are the files that have a format that _isn't_ # supported by MAT2, but that we want to keep anyway. @@ -72,7 +90,11 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): zin.extract(member=item, path=temp_folder) full_path = os.path.join(temp_folder, item.filename) - self._specific_cleanup(full_path) + if self._specific_cleanup(full_path) is False: + shutil.rmtree(temp_folder) + os.remove(self.output_filename) + print("Something went wrong during deep cleaning of %s" % item.filename) + return False if item.filename in self.files_to_keep: # those files aren't supported, but we want to add them anyway @@ -118,6 +140,45 @@ class MSOfficeParser(ArchiveBasedAbstractParser): '^docProps/', })) + def __remove_revisions(self, full_path:str) -> bool: + """ In this function, we're changing the XML + document in two times, since we don't want + to change the tree we're iterating on.""" + tree, ns = _parse_xml(full_path) + + # No revisions are present + if tree.find('.//w:del', ns) is None: + return True + elif tree.find('.//w:ins', ns) is None: + return True + + parent_map = {c:p for p in tree.iter( ) for c in p} + + elements = list([element for element in tree.iterfind('.//w:del', ns)]) + for element in elements: + parent_map[element].remove(element) + + elements = list() + for element in tree.iterfind('.//w:ins', ns): + for position, item in enumerate(tree.iter()): + if item == element: + for children in element.iterfind('./*'): + elements.append((element, position, children)) + break + + for (element, position, children) in elements: + parent_map[element].insert(position, children) + parent_map[element].remove(element) + + tree.write(full_path, xml_declaration=True) + + return True + + def _specific_cleanup(self, full_path:str) -> bool: + if full_path.endswith('/word/document.xml'): + return self.__remove_revisions(full_path) + return True + def get_meta(self) -> Dict[str, str]: """ Yes, I know that parsing xml with regexp ain't pretty, @@ -168,27 +229,16 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): def __remove_revisions(self, full_path:str) -> bool: - def parse_map(f): # etree support for ns is a bit rough - ns_map = dict() - for event, (k, v) in ET.iterparse(f, ("start-ns", )): - if event == "start-ns": - ns_map[k] = v - return ns_map - - ns = parse_map(full_path) + tree, ns = _parse_xml(full_path) + if 'office' not in ns.keys(): # no revisions in the current file return True - # Register the namespaces - for k,v in ns.items(): - ET.register_namespace(k, v) - - tree = ET.parse(full_path) for text in tree.getroot().iterfind('.//office:text', ns): for changes in text.iterfind('.//text:tracked-changes', ns): text.remove(changes) - tree.write(full_path, xml_declaration = True) + tree.write(full_path, xml_declaration=True) return True @@ -219,4 +269,3 @@ class LibreOfficeParser(ArchiveBasedAbstractParser): metadata[key] = value zipin.close() return metadata - diff --git a/tests/data/revision.docx b/tests/data/revision.docx new file mode 100644 index 0000000000000000000000000000000000000000..8a2d814cbf0422bee6878ca960587c0413de5f8d GIT binary patch literal 4701 zcmaJ_cQl;a9@U}@!yuwZPb7MYI(mpsTp__IVf4{^x!Mp$j}l!Fy+&s;h)$U3f`ka} zAZnsS%^UZ=d*f!km*1>4>-%H&obT-4+2`zibT#k^uHq0A6XRG0E^6XjDpKsTnH$v6 zU4Z}MyYi`)2EGDW#P%7D($)t@p2&Xm0lT6z`#upoePe3`O(bGh`F&mi(5j1g-uKG) z>kiWx41NAwJ1aU`p{vYJ!BudyRnXpNPDX&md|aT_P&Q#gJ>BGX**rYof=VquvPW~H zR8+{dB$LfrN3Re3n7ytz(e*>)VOo&z^IW0HDf@n7BwoXb>Z!Px-?ds!_zi2*B9rPm z34R)FU#n5`0^d^CBYV*lG9QTTnSY)H8G7e8(R*(ds(g1JNjz+9O5rXvR`LFBC+ZyV z=2f5LsOhKZ;ry-{#=Tr!4I<*z5nPTL?0jYLaBy`0FC!rAFRYv`b={m@+yyLLT=;!p zj@gNCoVr*+%7<qFQYfR+#-KWf=ERd$#DuD6(iS^_I&%<Az^Q)T)|_Zkjm&(*jAkZm zZCjO<4x<>%VBm8L6&CDNht4ttidKK+8dO|X`7+N?GD?GIR43iKKNS)WP)sGhF;Nmd zS}91Lk3X!G<Cv6Hu%FS#85q5}TSD;aq3f7AJ3n!$pb0x2iQAS6Yb&K8=9^hb{o^!& zjf`>)unOZy6^4?oCkSfEvCUW*!Z)_dy;|Uoq5>Di*2Erv**RbgDbHGlnhuE#X7N9H z3-v77s#EJ?lUZcE+YPrB_LnL(MUx_mC0bYb;B4ClBdC4hn*0~v3`Q|={``-RK?e!^ zI?LU&<?m3^{Dc$vSzcOM$t%8}TR!9QJn6X4CB&b3hNYB<*q|y7!G(*1gZE2{DKArO z>Ffr*;5blEOQ%ba4E2?#?EM_)XLF8qDUuOwO+}-dpr!;rW0g!5wbu9`o!LrgNB8i7 zzwd#2mL>(1a--wnqf(N5N(Ef*`zv~tH*M30JB0uhOSdt~h327g=A)u0HMn)Y@Xq7g zz+SbQ5L&%d5*Y<XjA++81Tgu%N=_cQxlD5)TW6TTfnWBBVJA32+9Y&I(wj*<G;fP$ z^pP+v>P3~-C}0^+373o8Mktp1{0}L?Le%kdW&X56k9$DFQs#Gc6b1*~XTP&-E~Tzx zioe0mU)feh=-nx&3LaYq6H^m5hk<wSHYTc+OqcqK9V5nY^F%q965d-U*DT6Jula`t zp_~bHT%ltyh^p@Es;35J5wjdX9wMHY33QO{Y^WSSR-oBv`uCGtB`z(tw*pI#rbCUz z4{2+jCde_NdQ*(VO?|7*w%=wlhQqBrx+8=*3P^*p<>zKPw8fsJ^}@zwVMUGmeb?a9 zkJhPKq}KovDaf6-u0>`SJuBCjS@gvAZ1tBtW5V{#+u6-Z;IG{Rs}N5Z)XC$bbARm| ziCTb8K~VT%4*Sr2(F0Iok<(olo_!pl?8et(Bm>-j{`d$Z?oXX7r-9o-0%m3>agY%r zSZpw_T1ejg#<3zw7w)EUu8L^2wmxk=6Ou}=kRC>Ag8I8oJ-)hp2g00Fc~MET$EJF` zfnC}9H)mLL3)ezVbO^8F-<W%VcUm<VoFe|Dy#@wx2xlxu%P=q`ehhJ9j!;c*Q=r-@ zN$SUy<(E4t4uAnuzPwwUdtJUi+<jUtC!e3Tf}zdHE)G{i6D?lzzM$hsK(M0+J6Y;q z(s8kb|D?km>fvGMWaEB8gmQwGc9%R^_+b`%hD>%aNbdeqgjP=|O)`FSR8mYDeSEKz zqNI#d0pdeluH4DM{=v;}k9&vW9$RjS72Xo`nm|g3{o&A#3>3RU+I=0Y>^MX+<2aY; zS4yYsfgt?=YB6FEiFz?;v~`25gHfGsl98JT#P`*fb}(>of~_w9_1sBqKs~OBXl2zY z7obOvYlvB8RDz=LrHzuX#{!E4(p|Hy-jC_yDwN`hPTq?NvNj=|l{e}g_g6QQZ^W<; znw;T+m!wATtFppi-wUsgtu_~MZ-e@Htn^uHUe$&*NHx!w*3upyM;=gI@MPmL6t;$C zK?}Qd{{L=D^*2w}&Q2cs7LOhO)cUaA8|^MZ(5tUBsoN8F6vXUffDnlXk|SSm8NJ6V zmhUZlG9Di`tNDd6fo@ZY``ds;q`xe!cQLqjcPvBLFm;*$kJ;`1P12*bKosg(C`U6h z-F8jFO{*58d9au7Cd4W+M;eV(6ccJ}(W_vr?+{tY*Lb)Y7M@mRQ7y&d1>0drPinnm zp9u9H#FLa9gy;5uYrPRGV9LVVDiKK_3K}PTulvSLvWOb8)loEJT-%@Dt^LZEwCd(6 zA>LBDJHc=Lyh&w2M5#k-!ax}mkrAS`Whfp*d^^lW6==PnDSp$~S7fcBxpDfpd`)Am zsvc&c?_>5_G!bo%FJH@hZUb(llAu-tHjTBoGR=^18$tam85N7`)?72|9bLcY?@>HH z3qsJ<!8&$5Di)Ea_tSPe-soI#KFjPCD2nCVgy2`4|FbqPi*(pf!+A~u)OKhHlG?M@ zaLek3TGB=X8&@r&aa_rf;zkT=cfaAOJ=#A%Cf~35V(%zAz#goCVEU4|&1kQ(*Yk`; zFM;o2UxSg#nrIYpi6+J*<;|+~*|ZfUB_@3^eFTs?N^Y_T-CxYlYeCzS3{v0Kja!H6 z8!JN$LadzFeXSeD!*i{S^dMKkVQtaJ%wF^l<{26K&WcLKo{t#V@}&3W@hv&Mfo}8U za1A_iR1l00fb_&Njx@eKT2L3NaERNTwasKK7jYrSXH&))3RfKn+PT3$2G=cE#O-Do zPrY~b&FzQUFj0Qs=$DDTHMUwXozH5T0q#InrG-#d2GGwMmOF?%AK0L4csesT3y-7+ zj{<4mZyPOglzN;C8&Fst`1>rLCDNZ6+=k7ln6tBjNH+3m{^(S}AV)GATWVq6o8EY; zN==%)1=|V@k)O<8S}1@I&IqfRs{S|<?&NgjE)Ogey5}@eITp(vRKH|CkbV!ZA10)R zzERiX=cS0j?0M`uYxK%~866SRC>7!D%I*!tM~2%rR5f#av*fJU{}9)xEPN<g*#6p1 zA!(Ujn<vV%=wkvkh1_ZKX@SG1+mPriTQ09|To4Q0;}y1tp^;+}`dqk+=Es~%J`!Qi znC_D2)5`eohskf;K|lge`eVFuxgumEZ=TqzM^Sb#f%2N<t%}{>-6`uyH`F^xTbUnt z^SbX4v;0jrN9U<WcFy=w40_xw{0k5r$SED<Z1u9c+j~|HGWQh;lRnHU88@WbC%txf z%2FkPFc%J!G4VmqBMv|tp%#)+gp@i%Q(ak|idR1vwg6`Qb-RxIiO&{w1<r{wfS&VT zjD6^O5zORE#om;?gb~#fe8Ilw(#x!0WoFSE<Hn29?mdI&`@$>xLAI}>?W8h?=bVGW z-xiYwY|)+$^P&OVV1}`zM(`t31Mdi$8Yage@15D0R7HpOCdc|iqqyGM1Jq0JeN^h_ zID0lq*(WjE8uy!9A&)k5`x6rgpS{mFd6h=tWig$Jj>&DRPQ|>4Rtn~O$QNMF+8oNW zRrg%dPSTpDZW{6~N%lj8L~j>_=lb{Hir?`rBsbBlzb1;0gVRIyE6Gv(z0J9M_&P%W zJPalc=G<llL4n(bK6~0^^hH`~S!7&Y@YK9)ljF9=>4#CgEl-&g+uEMi4EI<Cn^+rN zukrAhlfn3`)I}G^_{MY7(12O93&eI4?y(mIFeD)7EIz}1)bZm(NaAqQ3)pEfxoB&d zLJV1o&6N9O<`vFKsqLdSgx-)ma}-kxy#{<pKC>$8ScGlK1VNl4JZCc`=hP&3La(b` z29x^Gt3n_9LYIB?USfYhBB!WgQ_7=xJETEx8L-wiS6(=QLU~^JWaYV?D*dE8E!l2u ziFk$^8=KV)V`>UX@s?ip)@}koQOf3^S?xxu%po11Dn<Nr2!b2fk3$FK;v!T#F{$Jr zRzIW(57vZVn+LkA0NcLrI1$d@(IzDvLnHL%d)W(Tk=^$`3SOz3ZF(Z}eA&8_qb8V1 zfOX#l|BkTkohgVp*|+-6gr+U0sRCbZ<FKf3MTX4C-)0%KHWb@6#}_nZ7dBcXdEWX> zIFGC{S)s?izIVwEgO|d=4SmSG`;KcuwL&-Xo{*J~XuK=jvsrzy{~5;`MIYPaXug*a z`apxq96-w>3NH&VC?KPTQJ}*YPc=@;VIpuksC={Q?`nz-dNjWLvGFJ2*9^acl}$w| zq4%#_N>+><;I+n2k?%coM6DK(40kH}PP5~0X>}Vq2l#1-eEmbqrBU9Hw{c;)aQ>MO zMLQ{`eb7ZIYW0}^qpBd=XU>i4_IP|G%bL7RTp-W`dRKI^Xz@9~jQa{;DK{V!DIvQC zXuq@R%%0P}iX`d|sI4_$yON@G7xIZMLl2%B-w3=)1(TVQbYl5LXeiiQ9H^$11w0bJ zM=|2B+?B+~kn>6zk<q5b$GbW+nr)xF%sXS1yyR)&a3{?$F*!1J4_=XgQt|X=oUGQe z*GzHZT<(CiR7l&l+wZd^CqkaRLrx@9peD)30&rTy7Bqk}JOR(5oAl$>73modwGq6c zW>x|dK;v7c5b^on&Wmb1cI9;a&4{WWn0H$S*zdH<FT|b&g-kR}%#K{m&<^92{uJ&B zlu9UxVD`>@nXWjghY7;HP*v`U`ZPUORl&bh6*X2>#!6V@fHj(C`o1nucgTg!LM7|y zzy(1)+b6eFs|gX;V}v@cYl7FW?>O2^%X8Fg6;~k_=5N(@(@}e!d9$$vZlBF-lJxkA z?+si7+dHUx<*e}Ha#W5L@y#d&W}3eAY`DFS!q-%FsIh_&Z)LbAGn3(oeC`-gvpi$P zLmj6|v*jw2p0d9cXsN#uOkx*FUvYbj1k(^|#Szd<jh`GN=eL?@vpeuK(_6_ZRa0Wx z;TGQhl5v6T=vdU%>;gjR&7By6m>?}Z1E*J>{kJ5dTa#QG?dk~zedzn2zK+WR=&y5A z1&p{b85>a)cTv=-942-fex9s<-y%Bj-Ex$JfsNoed8aQk&Q7$J;O<}n3Hd-Y4MXVE zr_%4eozJ9(Mx(i8`M-Nh58VtKu|AURY<tq)94qAASq+dq7bp|IVB5uObd3*d8%+PS z4Y+t$aeiC|Tn@z-mjVB-OIHKB8vh1fj(QiF`A1P<hx$1||BSvIIxhUmkMhL&?tjpK zhr^%YmreRbGWk)ruyg!XPWkztF5B3Dzlg+@|6%cehF^AM7hd;AnG^mO{x9eIGyd}N z^6%j%i2jSe?BafgU)IsZ8T6y>VVn6^K>6oH`WbwASzJi%M`e@$PvPlmT)|EX2Zsdv M0b|=l`e)z&17g<I@c;k- literal 0 HcmV?d00001 diff --git a/tests/test_libmat2.py b/tests/test_libmat2.py index 1573790..4df6385 100644 --- a/tests/test_libmat2.py +++ b/tests/test_libmat2.py @@ -121,6 +121,7 @@ class TestRemovingThumbnails(unittest.TestCase): zipin.close() os.remove('./tests/data/clean.cleaned.odt') + os.remove('./tests/data/clean.odt') class TestRevisionsCleaning(unittest.TestCase): @@ -142,6 +143,26 @@ class TestRevisionsCleaning(unittest.TestCase): os.remove('./tests/data/clean.odt') os.remove('./tests/data/clean.cleaned.odt') + def test_msoffice(self): + with zipfile.ZipFile('./tests/data/revision.docx') as zipin: + c = zipin.open('word/document.xml') + content = c.read() + r = b'<w:ins w:id="1" w:author="Unknown Author" w:date="2018-06-28T23:48:00Z">' + self.assertIn(r, content) + + shutil.copy('./tests/data/revision.docx', './tests/data/revision_clean.docx') + p = office.MSOfficeParser('./tests/data/revision_clean.docx') + self.assertTrue(p.remove_all()) + + with zipfile.ZipFile('./tests/data/revision_clean.cleaned.docx') as zipin: + c = zipin.open('word/document.xml') + content = c.read() + r = b'<w:ins w:id="1" w:author="Unknown Author" w:date="2018-06-28T23:48:00Z">' + self.assertNotIn(r, content) + + os.remove('./tests/data/revision_clean.docx') + os.remove('./tests/data/revision_clean.cleaned.docx') + class TestDeepCleaning(unittest.TestCase): def __check_deep_meta(self, p): -- GitLab