Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
  • add-metadata-cleaner-link
  • add_wav_support
  • bak
  • elementary-contract
  • factorize
  • fix-key-error-for-svg-without-xmlns
  • fix_testsuite
  • implement_lightweight_mode_msoffice
  • improve_images
  • improve_svg
  • improve_zip
  • inverted_backup
  • master
  • patch-1
  • pdf_depth
  • please_mypy
  • ppt
  • 0.1.0
  • 0.1.1
  • 0.1.2
  • 0.1.3
  • 0.10.0
  • 0.10.1
  • 0.11.0
  • 0.2.0
  • 0.3.0
  • 0.3.1
  • 0.4.0
  • 0.5.0
  • 0.6.0
  • 0.7.0
  • 0.8.0
  • 0.9.0
33 results

Target

Select target project
  • tguinot/mat2
  • jvoisin/mat2
  • dachary/mat2
  • mejo-/mat2
  • LogicalDash/mat2
  • dkg/mat2
  • christian/mat2
  • Selflike323/mat2
  • fz/mat2
  • iwwmidatlanticgdc/mat2
  • Gu1nn3zz/mat2
  • smagnin/mat2
  • flashcode/mat2
  • MANCASTILLEJA/mat2
  • jboursier/mat2
  • tails/mat2
  • matiargs/mat2
  • Brolf/mat2
  • madaidan/mat2
  • Delmer84/mat2
  • yuebyzua/mat2
  • yyyyyyyan/mat2
  • rmnvgr/mat2
  • Marxism-Leninism/mat2
  • GNUtoo/mat2
  • allexj/mat2
  • b068931cc450442b63f5b3d276ea4297/mat2
  • chenrui/mat2
  • nosec13346/mat2
  • anelki/mat2
30 results
Select Git revision
  • fix_heic
  • master
  • 0.1.0
  • 0.1.1
  • 0.1.2
  • 0.1.3
  • 0.10.0
  • 0.10.1
  • 0.11.0
  • 0.12.0
  • 0.12.1
  • 0.12.2
  • 0.12.3
  • 0.12.4
  • 0.13.0
  • 0.13.1
  • 0.13.2
  • 0.13.3
  • 0.13.4
  • 0.13.5
  • 0.2.0
  • 0.3.0
  • 0.3.1
  • 0.4.0
  • 0.5.0
  • 0.6.0
  • 0.7.0
  • 0.8.0
  • 0.9.0
29 results
Show changes
Commits on Source (6)
......@@ -16,7 +16,7 @@ linting:bandit:
script: # TODO: remove B405 and B314
- bandit ./mat2 --format txt --skip B101
- bandit -r ./nautilus/ --format txt --skip B101
- bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314,B108
- bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314,B108,B311
linting:codespell:
image: $CONTAINER_REGISTRY:linting
......
......@@ -152,6 +152,8 @@ Copyright 2016 Marie-Rose for mat2's logo
The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3,
and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx
The `narrated_powerpoint_presentation.pptx` file is in the public domain.
# Thanks
mat2 wouldn't exist without:
......
......@@ -82,6 +82,13 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# pylint: disable=unused-argument,no-self-use
return {} # pragma: no cover
def _final_checks(self) -> bool:
""" This method is invoked after the file has been cleaned,
allowing to run final verifications.
"""
# pylint: disable=unused-argument,no-self-use
return True
@staticmethod
@abc.abstractmethod
def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
......@@ -223,6 +230,8 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
if abort:
os.remove(self.output_filename)
return False
if not self._final_checks():
return False # pragma: no cover
return True
......
import random
import uuid
import logging
import os
......@@ -75,6 +76,12 @@ class MSOfficeParser(ZipParser):
def __init__(self, filename):
super().__init__(filename)
#
self.__counters = {
'cNvPr': set(),
'rid': set(),
}
self.files_to_keep = set(map(re.compile, { # type: ignore
r'^\[Content_Types\]\.xml$',
r'^_rels/\.rels$',
......@@ -84,8 +91,14 @@ class MSOfficeParser(ZipParser):
r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$',
r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$',
r'^(?:word|ppt)/tableStyles\.xml$',
r'^ppt/slides/_rels/slide[0-9]*\.xml\.rels$',
r'^ppt/slides/slide[0-9]*\.xml$',
# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
r'^(?:word|ppt)/stylesWithEffects\.xml$',
r'^ppt/presentation\.xml$',
# TODO: check if p:bgRef can be randomized
r'^ppt/slideMasters/slideMaster[0-9]+\.xml',
r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels',
}))
self.files_to_omit = set(map(re.compile, { # type: ignore
r'^customXml/',
......@@ -95,6 +108,7 @@ class MSOfficeParser(ZipParser):
r'^(?:word|ppt)/theme',
r'^(?:word|ppt)/people\.xml$',
r'^(?:word|ppt)/numbering\.xml$',
r'^(?:word|ppt)/tags/',
# View properties like view mode, last viewed slide etc
r'^(?:word|ppt)/viewProps\.xml$',
# Additional presentation-wide properties like printing properties,
......@@ -146,7 +160,7 @@ class MSOfficeParser(ZipParser):
"""
try:
tree, namespace = _parse_xml(full_path)
except ET.ParseError as e:
except ET.ParseError as e: # pragma: no cover
logging.error("Unable to parse %s: %s", full_path, e)
return False
......@@ -206,7 +220,7 @@ class MSOfficeParser(ZipParser):
def __remove_revisions(full_path: str) -> bool:
try:
tree, namespace = _parse_xml(full_path)
except ET.ParseError as e:
except ET.ParseError as e: # pragma: no cover
logging.error("Unable to parse %s: %s", full_path, e)
return False
......@@ -272,14 +286,74 @@ class MSOfficeParser(ZipParser):
tree.write(full_path, xml_declaration=True)
return True
def _final_checks(self) -> bool:
for k, v in self.__counters.items():
if v and len(v) != max(v):
# TODO: make this an error and return False
# once the ability to correct the counters is implemented
logging.warning("%s contains invalid %s: %s", self.filename, k, v)
return True
return True
def __collect_counters(self, full_path: str):
""" MSOffice documents are using various counters for cross-references,
we collect them all, to make sure that they're effectively counters,
and not unique id used for fingerprinting."""
with open(full_path, encoding='utf-8') as f:
content = f.read()
# relationship id
for i in re.findall(r'(?:\s|r:)[iI][dD]="rId([0-9]+)"(?:\s|/)', content):
self.__counters['rid'].add(int(i))
# Connector non visual property
for i in re.findall(r'<p:cNvPr id="([0-9]+)"', content):
self.__counters['cNvPr'].add(int(i))
@staticmethod
def __randomize_creationId(full_path: str) -> bool:
    """ Replace the `val` attribute of every `p14:creationId` element
    found in `full_path` with a random value, rewriting the file in place.

    :param full_path: path of the XML file to process
    :return: `False` if the file can't be parsed, `True` otherwise,
        including when the file doesn't declare the `p14` namespace.
    """
    try:
        tree, namespace = _parse_xml(full_path)
    except ET.ParseError as e:  # pragma: no cover
        logging.error("Unable to parse %s: %s", full_path, e)
        return False

    if 'p14' not in namespace:
        return True  # pragma: no cover

    for item in tree.iterfind('.//p14:creationId', namespace):
        # `val` is an unsigned 32-bit integer, and `randint` includes
        # both of its bounds: the upper one has to be 2**32 - 1.
        item.set('val', '%s' % random.randint(0, 2**32 - 1))
    tree.write(full_path, xml_declaration=True)
    return True
@staticmethod
def __randomize_sldMasterId(full_path: str) -> bool:
    """ Replace the `id` attribute of every `p:sldMasterId` element
    found in `full_path` with a random value, rewriting the file in place.

    :param full_path: path of the XML file to process
    :return: `False` if the file can't be parsed, `True` otherwise,
        including when the file doesn't declare the `p` namespace.
    """
    try:
        tree, namespace = _parse_xml(full_path)
    except ET.ParseError as e:  # pragma: no cover
        logging.error("Unable to parse %s: %s", full_path, e)
        return False

    if 'p' not in namespace:
        return True  # pragma: no cover

    for item in tree.iterfind('.//p:sldMasterId', namespace):
        # ECMA-376 restricts slide master ids (ST_SlideMasterId) to
        # 2147483648 <= id < 4294967296, and `randint` includes both of
        # its bounds, hence this range.
        item.set('id', '%s' % random.randint(2**31, 2**32 - 1))
    tree.write(full_path, xml_declaration=True)
    return True
def _specific_cleanup(self, full_path: str) -> bool:
# pylint: disable=too-many-return-statements
# pylint: disable=too-many-return-statements,too-many-branches
if os.stat(full_path).st_size == 0: # Don't process empty files
return True
if not full_path.endswith('.xml'):
return True
if self.__randomize_creationId(full_path) is False:
return False
self.__collect_counters(full_path)
if full_path.endswith('/[Content_Types].xml'):
# this file contains references to files that we might
# remove, and MS Office doesn't like dangling references
......@@ -288,7 +362,7 @@ class MSOfficeParser(ZipParser):
elif full_path.endswith('/word/document.xml'):
# this file contains the revisions
if self.__remove_revisions(full_path) is False:
return False
return False # pragma: no cover
elif full_path.endswith('/docProps/app.xml'):
# This file must be present and valid,
# so we're removing as much as we can.
......@@ -310,9 +384,12 @@ class MSOfficeParser(ZipParser):
f.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>')
uid = str(uuid.uuid4()).encode('utf-8')
f.write(b'<a:tblStyleLst def="{%s}" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"/>' % uid)
elif full_path.endswith('ppt/presentation.xml'):
if self.__randomize_sldMasterId(full_path) is False:
return False # pragma: no cover
if self.__remove_rsid(full_path) is False:
return False
return False # pragma: no cover
if self.__remove_nsid(full_path) is False:
return False # pragma: no cover
......
File added
......@@ -777,3 +777,16 @@ class TestNoSandbox(unittest.TestCase):
os.remove('./tests/data/clean.png')
os.remove('./tests/data/clean.cleaned.png')
os.remove('./tests/data/clean.cleaned.cleaned.png')
class TestComplexOfficeFiles(unittest.TestCase):
    """ Cleaning tests run against elaborate office documents. """

    def test_complex_pptx(self):
        """ Cleaning a copy of the narrated presentation must succeed. """
        target = './tests/data/clean.pptx'
        shutil.copy('./tests/data/narrated_powerpoint_presentation.pptx', target)

        parser = office.MSOfficeParser(target)
        cleaned = parser.remove_all()
        self.assertTrue(cleaned)

        # Get rid of both the working copy and the cleaned output.
        os.remove(target)
        os.remove(parser.output_filename)