From 0cf0541ad9c2f40aa987cb34be34bc33b7341232 Mon Sep 17 00:00:00 2001
From: jvoisin <julien.voisin@dustri.org>
Date: Sun, 1 Sep 2019 12:54:56 +0200
Subject: [PATCH] Remove nsid fields from MSOffice documents

nsids are random identifiers, usually used to ease merging
between documents, and can trivially be used for fingerprinting.
---
 libmat2/office.py | 50 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 6 deletions(-)

diff --git a/libmat2/office.py b/libmat2/office.py
index b769991..c9bed7a 100644
--- a/libmat2/office.py
+++ b/libmat2/office.py
@@ -62,9 +62,6 @@ class MSOfficeParser(ZipParser):
 
         # Do we want to keep the following ones?
         'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml',
-
-        # See https://0xacab.org/jvoisin/mat2/issues/71
-        'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml',  # /word/numbering.xml
     }
 
 
@@ -88,6 +85,7 @@ class MSOfficeParser(ZipParser):
             r'^word/printerSettings/',
             r'^word/theme',
             r'^word/people\.xml$',
+            r'^word/numbering\.xml$',
 
             # we have an allowlist in self.files_to_keep,
             # so we can trash everything else
@@ -124,7 +122,7 @@ class MSOfficeParser(ZipParser):
 
     @staticmethod
     def __remove_rsid(full_path: str) -> bool:
-        """ The method will remove "revision session ID".  We're '}rsid'
+        """ The method will remove "revision session ID".  We're using '}rsid'
         instead of proper parsing, since rsid can have multiple forms, like
         `rsidRDefault`, `rsidR`, `rsids`, …
 
@@ -137,7 +135,8 @@ class MSOfficeParser(ZipParser):
         """
         try:
             tree, namespace = _parse_xml(full_path)
-        except ET.ParseError:
+        except ET.ParseError as e:
+            logging.error("Unable to parse %s: %s", full_path, e)
             return False
 
         # rsid, tags or attributes, are always under the `w` namespace
@@ -161,6 +160,41 @@ class MSOfficeParser(ZipParser):
         tree.write(full_path, xml_declaration=True)
         return True
 
+    @staticmethod
+    def __remove_nsid(full_path: str) -> bool:
+        """ Remove all the `w:nsid` elements from the XML file `full_path`.
+
+        NSID are random identifiers that can be used to ease the merging of
+        some components of a document; they can also be used for
+        fingerprinting. See the spec for more details:
+        https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing.nsid?view=openxml-2.8.1
+
+        Matches are collected before being removed, since we don't want to
+        change the tree we're currently iterating on.
+
+        Return False if the file can't be parsed, and True otherwise.
+        """
+        try:
+            tree, namespace = _parse_xml(full_path)
+        except ET.ParseError as e:  # pragma: no cover
+            logging.error("Unable to parse %s: %s", full_path, e)
+            return False
+
+        # The NSID tag is always under the `w` namespace
+        if 'w' not in namespace:
+            return True
+
+        # Elements don't know their parent, so map each child to its parent.
+        parent_map = {child: parent for parent in tree.iter() for child in parent}
+
+        # Materialise the matches first, so removals can't disturb the search.
+        for element in list(tree.iterfind('.//w:nsid', namespace)):
+            parent_map[element].remove(element)
+
+        tree.write(full_path, xml_declaration=True)
+        return True
+
+
     @staticmethod
     def __remove_revisions(full_path: str) -> bool:
         """ In this function, we're changing the XML document in several
@@ -208,7 +242,8 @@ class MSOfficeParser(ZipParser):
         """
         try:
             tree, namespace = _parse_xml(full_path)
-        except ET.ParseError:  # pragma: no cover
+        except ET.ParseError as e:  # pragma: no cover
+            logging.error("Unable to parse %s: %s", full_path, e)
             return False
 
         if len(namespace.items()) != 1:
@@ -269,6 +304,9 @@ class MSOfficeParser(ZipParser):
         if self.__remove_rsid(full_path) is False:
             return False
 
+        if self.__remove_nsid(full_path) is False:
+            return False  # pragma: no cover
+
         try:
             _sort_xml_attributes(full_path)
         except ET.ParseError as e:  # pragma: no cover
-- 
GitLab