Skip to content
Snippets Groups Projects
Commit 0170f0e3 authored by Julien (jvoisin) Voisin
Browse files

Improve a bit the comments in the code

This is related to the previous commit
parent 0cf0541a
No related branches found
No related tags found
1 merge request: !59 "Remove nsid in MSOffice files"
...@@ -44,6 +44,12 @@ def _sort_xml_attributes(full_path: str) -> bool: ...@@ -44,6 +44,12 @@ def _sort_xml_attributes(full_path: str) -> bool:
class MSOfficeParser(ZipParser): class MSOfficeParser(ZipParser):
"""
The methods modifying XML documents usually do so in two passes:
1. finding the tags/attributes to remove;
2. actually editing the document,
since it's tricky to modify the XML while iterating over it.
"""
mimetypes = { mimetypes = {
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
...@@ -126,9 +132,6 @@ class MSOfficeParser(ZipParser): ...@@ -126,9 +132,6 @@ class MSOfficeParser(ZipParser):
instead of proper parsing, since rsid can have multiple forms, like instead of proper parsing, since rsid can have multiple forms, like
`rsidRDefault`, `rsidR`, `rsids`, … `rsidRDefault`, `rsidR`, `rsids`, …
We're removing rsid tags in two times, because we can't modify
the xml while we're iterating on it.
For more details, see For more details, see
- https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx - https://msdn.microsoft.com/en-us/library/office/documentformat.openxml.wordprocessing.previoussectionproperties.rsidrpr.aspx
- https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/ - https://blogs.msdn.microsoft.com/brian_jones/2006/12/11/whats-up-with-all-those-rsids/
...@@ -163,15 +166,11 @@ class MSOfficeParser(ZipParser): ...@@ -163,15 +166,11 @@ class MSOfficeParser(ZipParser):
@staticmethod @staticmethod
def __remove_nsid(full_path: str) -> bool: def __remove_nsid(full_path: str) -> bool:
""" """
NSID are random identifiers that can be used nsid are random identifiers that can be used to ease the merging of
to ease the merging of some components of a document. some components of a document. They can also be used for
They can also be used for fingerprinting. fingerprinting.
See the spec for more details: https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing.nsid?view=openxml-2.8.1 See the spec for more details: https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing.nsid?view=openxml-2.8.1
In this function, we're changing the XML document in several
different times, since we don't want to change the tree we're currently
iterating on.
""" """
try: try:
tree, namespace = _parse_xml(full_path) tree, namespace = _parse_xml(full_path)
...@@ -179,7 +178,7 @@ class MSOfficeParser(ZipParser): ...@@ -179,7 +178,7 @@ class MSOfficeParser(ZipParser):
logging.error("Unable to parse %s: %s", full_path, e) logging.error("Unable to parse %s: %s", full_path, e)
return False return False
# The NSID tag is always under the `w` namespace # The nsid tag is always under the `w` namespace
if 'w' not in namespace.keys(): if 'w' not in namespace.keys():
return True return True
...@@ -197,10 +196,6 @@ class MSOfficeParser(ZipParser): ...@@ -197,10 +196,6 @@ class MSOfficeParser(ZipParser):
@staticmethod @staticmethod
def __remove_revisions(full_path: str) -> bool: def __remove_revisions(full_path: str) -> bool:
""" In this function, we're changing the XML document in several
different times, since we don't want to change the tree we're currently
iterating on.
"""
try: try:
tree, namespace = _parse_xml(full_path) tree, namespace = _parse_xml(full_path)
except ET.ParseError as e: except ET.ParseError as e:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment