Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jvoisin
mat2
Commits
d7a03d90
Commit
d7a03d90
authored
Mar 08, 2020
by
jvoisin
Browse files
Vastly improve ppt compatibility
parent
a23dc001
Changes
6
Hide whitespace changes
Inline
Side-by-side
.gitlab-ci.yml
View file @
d7a03d90
...
...
@@ -16,7 +16,7 @@ linting:bandit:
script
:
# TODO: remove B405 and B314
-
bandit ./mat2 --format txt --skip B101
-
bandit -r ./nautilus/ --format txt --skip B101
-
bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314,B108
-
bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314,B108
,B311
linting:codespell:
image
:
$CONTAINER_REGISTRY:linting
...
...
README.md
View file @
d7a03d90
...
...
@@ -152,6 +152,8 @@ Copyright 2016 Marie-Rose for mat2's logo
The
`tests/data/dirty_with_nsid.docx`
file is licensed under GPLv3,
and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx
The
`narrated_powerpoint_presentation.pptx`
file is in the public domain.
# Thanks
mat2 wouldn't exist without:
...
...
libmat2/archive.py
View file @
d7a03d90
...
...
@@ -82,6 +82,13 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# pylint: disable=unused-argument,no-self-use
return
{}
# pragma: no cover
def
_final_checks
(
self
)
->
bool
:
""" This method is invoked after the file has been cleaned,
allowing to run final verifications.
"""
# pylint: disable=unused-argument,no-self-use
return
True
@
staticmethod
@
abc
.
abstractmethod
def
_get_all_members
(
archive
:
ArchiveClass
)
->
List
[
ArchiveMember
]:
...
...
@@ -223,6 +230,8 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
if
abort
:
os
.
remove
(
self
.
output_filename
)
return
False
if
not
self
.
_final_checks
():
return
False
# pragma: no cover
return
True
...
...
libmat2/office.py
View file @
d7a03d90
import
random
import
uuid
import
logging
import
os
...
...
@@ -75,6 +76,14 @@ class MSOfficeParser(ZipParser):
def
__init__
(
self
,
filename
):
super
().
__init__
(
filename
)
# MSOffice documents are using various counters for cross-references,
# we collect them all, to make sure that they're effectively counters,
# and not unique id used for fingerprinting.
self
.
__counters
=
{
'cNvPr'
:
set
(),
'rid'
:
set
(),
}
self
.
files_to_keep
=
set
(
map
(
re
.
compile
,
{
# type: ignore
r
'^\[Content_Types\]\.xml$'
,
r
'^_rels/\.rels$'
,
...
...
@@ -84,8 +93,14 @@ class MSOfficeParser(ZipParser):
r
'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$'
,
r
'^ppt/slideLayouts/slideLayout[0-9]+\.xml$'
,
r
'^(?:word|ppt)/tableStyles\.xml$'
,
r
'^ppt/slides/_rels/slide[0-9]*\.xml\.rels$'
,
r
'^ppt/slides/slide[0-9]*\.xml$'
,
# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
r
'^(?:word|ppt)/stylesWithEffects\.xml$'
,
r
'^ppt/presentation\.xml$'
,
# TODO: check if p:bgRef can be randomized
r
'^ppt/slideMasters/slideMaster[0-9]+\.xml'
,
r
'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels'
,
}))
self
.
files_to_omit
=
set
(
map
(
re
.
compile
,
{
# type: ignore
r
'^customXml/'
,
...
...
@@ -95,6 +110,7 @@ class MSOfficeParser(ZipParser):
r
'^(?:word|ppt)/theme'
,
r
'^(?:word|ppt)/people\.xml$'
,
r
'^(?:word|ppt)/numbering\.xml$'
,
r
'^(?:word|ppt)/tags/'
,
# View properties like view mode, last viewed slide etc
r
'^(?:word|ppt)/viewProps\.xml$'
,
# Additional presentation-wide properties like printing properties,
...
...
@@ -146,7 +162,7 @@ class MSOfficeParser(ZipParser):
"""
try
:
tree
,
namespace
=
_parse_xml
(
full_path
)
except
ET
.
ParseError
as
e
:
except
ET
.
ParseError
as
e
:
# pragma: no cover
logging
.
error
(
"Unable to parse %s: %s"
,
full_path
,
e
)
return
False
...
...
@@ -206,7 +222,7 @@ class MSOfficeParser(ZipParser):
def
__remove_revisions
(
full_path
:
str
)
->
bool
:
try
:
tree
,
namespace
=
_parse_xml
(
full_path
)
except
ET
.
ParseError
as
e
:
except
ET
.
ParseError
as
e
:
# pragma: no cover
logging
.
error
(
"Unable to parse %s: %s"
,
full_path
,
e
)
return
False
...
...
@@ -272,14 +288,71 @@ class MSOfficeParser(ZipParser):
tree
.
write
(
full_path
,
xml_declaration
=
True
)
return
True
def
_final_checks
(
self
)
->
bool
:
for
k
,
v
in
self
.
__counters
.
items
():
if
v
and
len
(
v
)
!=
max
(
v
):
# TODO: make this an error and return False
# once the ability to correct the counters is implemented
logging
.
warning
(
"%s contains invalid %s: %s"
,
self
.
filename
,
k
,
v
)
return
True
return
True
def
__collect_counters
(
self
,
full_path
:
str
):
with
open
(
full_path
,
encoding
=
'utf-8'
)
as
f
:
content
=
f
.
read
()
# "relationship Id"
for
i
in
re
.
findall
(
r
'(?:\s|r:)[iI][dD]="rId([0-9]+)"(?:\s|/)'
,
content
):
self
.
__counters
[
'rid'
].
add
(
int
(
i
))
# "connector for Non-visual property"
for
i
in
re
.
findall
(
r
'<p:cNvPr id="([0-9]+)"'
,
content
):
self
.
__counters
[
'cNvPr'
].
add
(
int
(
i
))
@
staticmethod
def
__randomize_creationId
(
full_path
:
str
)
->
bool
:
try
:
tree
,
namespace
=
_parse_xml
(
full_path
)
except
ET
.
ParseError
as
e
:
# pragma: no cover
logging
.
error
(
"Unable to parse %s: %s"
,
full_path
,
e
)
return
False
if
'p14'
not
in
namespace
.
keys
():
return
True
# pragma: no cover
for
item
in
tree
.
iterfind
(
'.//p14:creationId'
,
namespace
):
item
.
set
(
'val'
,
'%s'
%
random
.
randint
(
0
,
2
**
32
))
tree
.
write
(
full_path
,
xml_declaration
=
True
)
return
True
@
staticmethod
def
__randomize_sldMasterId
(
full_path
:
str
)
->
bool
:
try
:
tree
,
namespace
=
_parse_xml
(
full_path
)
except
ET
.
ParseError
as
e
:
# pragma: no cover
logging
.
error
(
"Unable to parse %s: %s"
,
full_path
,
e
)
return
False
if
'p'
not
in
namespace
.
keys
():
return
True
# pragma: no cover
for
item
in
tree
.
iterfind
(
'.//p:sldMasterId'
,
namespace
):
item
.
set
(
'id'
,
'%s'
%
random
.
randint
(
0
,
2
**
32
))
tree
.
write
(
full_path
,
xml_declaration
=
True
)
return
True
def
_specific_cleanup
(
self
,
full_path
:
str
)
->
bool
:
# pylint: disable=too-many-return-statements
# pylint: disable=too-many-return-statements
,too-many-branches
if
os
.
stat
(
full_path
).
st_size
==
0
:
# Don't process empty files
return
True
if
not
full_path
.
endswith
(
'.xml'
):
return
True
if
self
.
__randomize_creationId
(
full_path
)
is
False
:
return
False
self
.
__collect_counters
(
full_path
)
if
full_path
.
endswith
(
'/[Content_Types].xml'
):
# this file contains references to files that we might
# remove, and MS Office doesn't like dangling references
...
...
@@ -288,7 +361,7 @@ class MSOfficeParser(ZipParser):
elif
full_path
.
endswith
(
'/word/document.xml'
):
# this file contains the revisions
if
self
.
__remove_revisions
(
full_path
)
is
False
:
return
False
return
False
# pragma: no cover
elif
full_path
.
endswith
(
'/docProps/app.xml'
):
# This file must be present and valid,
# so we're removing as much as we can.
...
...
@@ -310,9 +383,12 @@ class MSOfficeParser(ZipParser):
f
.
write
(
b
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
)
uid
=
str
(
uuid
.
uuid4
()).
encode
(
'utf-8'
)
f
.
write
(
b
'<a:tblStyleLst def="{%s}" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"/>'
%
uid
)
elif
full_path
.
endswith
(
'ppt/presentation.xml'
):
if
self
.
__randomize_sldMasterId
(
full_path
)
is
False
:
return
False
# pragma: no cover
if
self
.
__remove_rsid
(
full_path
)
is
False
:
return
False
return
False
# pragma: no cover
if
self
.
__remove_nsid
(
full_path
)
is
False
:
return
False
# pragma: no cover
...
...
tests/data/narrated_powerpoint_presentation.pptx
0 → 100644
View file @
d7a03d90
File added
tests/test_libmat2.py
View file @
d7a03d90
...
...
@@ -777,3 +777,13 @@ class TestNoSandbox(unittest.TestCase):
os
.
remove
(
'./tests/data/clean.png'
)
os
.
remove
(
'./tests/data/clean.cleaned.png'
)
os
.
remove
(
'./tests/data/clean.cleaned.cleaned.png'
)
class
TestComplexOfficeFiles
(
unittest
.
TestCase
):
def
test_complex_pptx
(
self
):
target
=
'./tests/data/clean.pptx'
shutil
.
copy
(
'./tests/data/narrated_powerpoint_presentation.pptx'
,
target
)
p
=
office
.
MSOfficeParser
(
target
)
self
.
assertTrue
(
p
.
remove_all
())
os
.
remove
(
target
)
os
.
remove
(
p
.
output_filename
)
jvoisin
@jvoisin
mentioned in issue
#126 (closed)
·
Mar 08, 2020
mentioned in issue
#126 (closed)
mentioned in issue #126
Toggle commit list
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment