diff --git a/libmat2/office.py b/libmat2/office.py
index 3a06624f26279a18cc6ea1198dcfca9bbebca3b0..ebf368991dddf06a1286890a4eb99481a7d0272d 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -442,6 +442,11 @@ class MSOfficeParser(ZipParser):
 
 
 class LibreOfficeParser(ZipParser):
+    """ The methods modifying XML documents are usually doing so in two loops:
+    1. finding the tag/attributes to remove;
+    2. actually editing the document.
+    since it's tricky to modify the XML while iterating on it.
+    """
     mimetypes = {
         'application/vnd.oasis.opendocument.text',
         'application/vnd.oasis.opendocument.spreadsheet',
@@ -489,6 +494,56 @@ class LibreOfficeParser(ZipParser):
         tree.write(full_path, xml_declaration=True)
         return True
 
+    @staticmethod
+    def __remove_xmlid(full_path: str) -> bool:
+        """ Replace the xml:id of the text:list elements with random ones.
+
+        xml:id are random identifiers that can be used to ease the merging of
+        some components of a document. They can also be used for
+        fingerprinting, so the existing ones are overwritten; lists without
+        an xml:id are left alone instead of being given a new identifier.
+
+        See the spec for more details: https://www.w3.org/TR/xml-id/
+
+        Return False if the file can't be parsed, True otherwise.
+        """
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError as e:  # pragma: no cover
+            logging.error("Unable to parse %s: %s", full_path, e)
+            return False
+
+        # Without the text namespace, there can't be any text:list.
+        if 'text' not in namespace:
+            return True
+
+        # The xml prefix is bound to this URI by the XML namespaces spec,
+        # so there is no need to dig it out of ElementTree's internals.
+        xml_id = '{http://www.w3.org/XML/1998/namespace}id'
+        continue_list = '{' + namespace['text'] + '}continue-list'
+
+        lists = list(tree.iterfind('.//text:list', namespace))
+        to_randomize = [elem for elem in lists if xml_id in elem.attrib]
+
+        # An xml:id has to be unique in its document, hence sampling without
+        # replacement instead of drawing every value independently.
+        new_ids = random.sample(range(2 ** 32), len(to_randomize))
+        renamed = dict()
+        for element, new_id in zip(to_randomize, new_ids):
+            fresh = 'list' + str(new_id)
+            renamed[element.get(xml_id)] = fresh
+            element.set(xml_id, fresh)
+
+        # text:continue-list refers to the xml:id of another list: keep those
+        # references in sync with the new identifiers.
+        for element in lists:
+            target = element.get(continue_list)
+            if target in renamed:
+                element.set(continue_list, renamed[target])
+
+        tree.write(full_path, xml_declaration=True)
+        return True
+
     def _specific_cleanup(self, full_path: str) -> bool:
         if os.stat(full_path).st_size == 0:  # Don't process empty files
             return True
@@ -497,6 +552,8 @@ class LibreOfficeParser(ZipParser):
             if os.path.basename(full_path) == 'content.xml':
                 if self.__remove_revisions(full_path) is False:
                     return False
+                if self.__remove_xmlid(full_path) is False:
+                    return False
 
             try:
                 _sort_xml_attributes(full_path)
diff --git a/tests/data/dirty_with_xmlid.odt b/tests/data/dirty_with_xmlid.odt
new file mode 100644
index 0000000000000000000000000000000000000000..2bbbc7251ca831ef093060784d71d5ba005a5e90
Binary files /dev/null and b/tests/data/dirty_with_xmlid.odt differ
diff --git a/tests/test_deep_cleaning.py b/tests/test_deep_cleaning.py
index aab46c76536ff0d86f6e7c17b36dd0ff2b6e965b..67f6dadf3d78bd7e328877f372b737363e02032b 100644
--- a/tests/test_deep_cleaning.py
+++ b/tests/test_deep_cleaning.py
@@ -5,6 +5,7 @@ import shutil
 import os
 import zipfile
 import tempfile
+import time
 
 from libmat2 import office, parser_factory
 
@@ -168,3 +169,35 @@ class TestNsidRemoval(unittest.TestCase):
 
         os.remove('./tests/data/clean.docx')
         os.remove('./tests/data/clean.cleaned.docx')
+
+class TestXMLidRandomize(unittest.TestCase):
+    def test_office(self):
+        shutil.copy('./tests/data/dirty_with_xmlid.odt',
+                    './tests/data/clean.odt')
+        p = office.LibreOfficeParser('./tests/data/clean.odt')
+
+        meta = p.get_meta()
+        self.assertIsNotNone(meta)
+
+        how_many_xmlid = 0
+        with zipfile.ZipFile('./tests/data/clean.odt') as zin:
+            for item in zin.infolist():
+                if not item.filename.endswith('.xml'):
+                    continue
+                num = zin.read(item).decode('utf-8').lower().count('xml:id')
+                how_many_xmlid += num
+        self.assertEqual(how_many_xmlid, 1)
+
+        ret = p.remove_all()
+        self.assertTrue(ret)
+
+        num = 0
+        with zipfile.ZipFile('./tests/data/clean.cleaned.odt') as zin:
+            for item in zin.infolist():
+                if not item.filename.endswith('.xml'):
+                    continue
+                num += zin.read(item).decode('utf-8').lower().count('xml:id')
+        self.assertEqual(num, 1)
+
+        os.remove('./tests/data/clean.odt')
+        os.remove('./tests/data/clean.cleaned.odt')