Commit 174d4a0a authored by jvoisin

Implement rsid stripping for office files

MS Office XML rsid is a "unique identifier used to track the editing session
when the physical character representing this section mark was last formatted."

See the following links for details:
parent fbcf68c2
Pipeline #19076 failed with stages
in 3 minutes and 24 seconds
......@@ -8,6 +8,8 @@ import xml.etree.ElementTree as ET # type: ignore
from .archive import ArchiveBasedAbstractParser
# pylint: disable=line-too-long
# Make pyflakes happy
assert Set
assert Pattern
......@@ -15,14 +17,12 @@ assert Pattern
def _parse_xml(full_path: str):
""" This function parses XML, with namespace support. """
cpt = 0
namespace_map = dict()
for _, (key, value) in ET.iterparse(full_path, ("start-ns", )):
# The ns[0-9]+ namespaces are reserved for internal usage, so
# we have to use another nomenclature.
if re.match('^ns[0-9]+$', key):
key = 'mat%d' % cpt
cpt += 1
if re.match('^ns[0-9]+$', key, re.I): #pragma: no cover
key = 'mat' + key[2:]
namespace_map[key] = value
ET.register_namespace(key, value)
......@@ -59,11 +59,56 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
files_to_omit = set(map(re.compile, { # type: ignore
def __remove_rsid(full_path: str) -> bool:
""" The method will remove "revision session ID". We're matching on '}rsid'
instead of proper parsing, since rsid can have multiple forms, like
`rsidRDefault`, `rsidR`, `rsids`, …
We're removing rsid tags in two passes, because we can't modify
the xml while we're iterating on it.
For more details, see
tree, namespace = _parse_xml(full_path)
except ET.ParseError:
return False
# rsid, tags or attributes, are always under the `w` namespace
if 'w' not in namespace.keys():
return True
parent_map = {c:p for p in tree.iter() for c in p}
elements_to_remove = list()
for item in tree.iterfind('.//', namespace):
if '}rsid' in item.tag.strip().lower(): # rsid as tag
for key in list(item.attrib.keys()): # rsid as attribute
if '}rsid' in key.lower():
del item.attrib[key]
for element in elements_to_remove:
tree.write(full_path, xml_declaration=True)
return True
def __remove_revisions(full_path: str) -> bool:
""" In this function, we're changing the XML document in several
......@@ -112,7 +157,13 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
if full_path.endswith('/word/document.xml'):
# this file contains the revisions
return self.__remove_revisions(full_path)
if self.__remove_revisions(full_path) is False:
return False
if full_path.endswith('.xml'):
if self.__remove_rsid(full_path) is False:
return False
return True
def get_meta(self) -> Dict[str, str]:
......@@ -105,3 +105,34 @@ class TestZipOrder(unittest.TestCase):
class TestRsidRemoval(unittest.TestCase):
def test_office(self):
shutil.copy('./tests/data/office_revision_session_ids.docx', './tests/data/clean.docx')
p = office.MSOfficeParser('./tests/data/clean.docx')
meta = p.get_meta()
how_many_rsid = False
with zipfile.ZipFile('./tests/data/clean.docx') as zin:
for item in zin.infolist():
if not item.filename.endswith('.xml'):
num ='utf-8').lower().count('w:rsid')
how_many_rsid += num
self.assertEqual(how_many_rsid, 11)
ret = p.remove_all()
with zipfile.ZipFile('./tests/data/clean.cleaned.docx') as zin:
for item in zin.infolist():
if not item.filename.endswith('.xml'):
num ='utf-8').lower().count('w:rsid')
self.assertEqual(num, 0)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment