Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
mat2
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Romain
mat2
Commits
f33dc0e7
Commit
f33dc0e7
authored
5 years ago
by
Julien (jvoisin) Voisin
Browse files
Options
Downloads
Patches
Plain Diff
Vastly improve ppt compatibility
parent
a23dc001
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
libmat2/archive.py
+8
-0
8 additions, 0 deletions
libmat2/archive.py
libmat2/office.py
+75
-0
75 additions, 0 deletions
libmat2/office.py
with
83 additions
and
0 deletions
libmat2/archive.py
+
8
−
0
View file @
f33dc0e7
...
...
@@ -82,6 +82,12 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# pylint: disable=unused-argument,no-self-use
return
{}
# pragma: no cover
def _final_checks(self) -> bool:
    """
    Hook called once the file has been cleaned, giving a parser the
    opportunity to perform last-minute verifications on the result.

    This default implementation has nothing to verify and always
    succeeds.

    :return: `True` when the final verifications passed.
    """
    return True
@staticmethod
@abc.abstractmethod
def
_get_all_members
(
archive
:
ArchiveClass
)
->
List
[
ArchiveMember
]:
...
...
@@ -223,6 +229,8 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
if
abort
:
os
.
remove
(
self
.
output_filename
)
return
False
if
not
self
.
_final_checks
():
return
False
return
True
...
...
This diff is collapsed.
Click to expand it.
libmat2/office.py
+
75
−
0
View file @
f33dc0e7
import
random
import
uuid
import
logging
import
os
...
...
@@ -75,6 +76,12 @@ class MSOfficeParser(ZipParser):
def
__init__
(
self
,
filename
):
super
().
__init__
(
filename
)
#
self
.
__counters
=
{
'
cNvPr
'
:
set
(),
'
rid
'
:
set
(),
}
self
.
files_to_keep
=
set
(
map
(
re
.
compile
,
{
# type: ignore
r
'
^\[Content_Types\]\.xml$
'
,
r
'
^_rels/\.rels$
'
,
...
...
@@ -84,8 +91,14 @@ class MSOfficeParser(ZipParser):
r
'
^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$
'
,
r
'
^ppt/slideLayouts/slideLayout[0-9]+\.xml$
'
,
r
'
^(?:word|ppt)/tableStyles\.xml$
'
,
r
'
^ppt/slides/_rels/slide[0-9]*\.xml\.rels$
'
,
r
'
^ppt/slides/slide[0-9]*\.xml$
'
,
# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
r
'
^(?:word|ppt)/stylesWithEffects\.xml$
'
,
r
'
^ppt/presentation\.xml$
'
,
# TODO: check if p:bgRef can be randomized
r
'
^ppt/slideMasters/slideMaster[0-9]+\.xml
'
,
r
'
^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels
'
,
}))
self
.
files_to_omit
=
set
(
map
(
re
.
compile
,
{
# type: ignore
r
'
^customXml/
'
,
...
...
@@ -95,6 +108,7 @@ class MSOfficeParser(ZipParser):
r
'
^(?:word|ppt)/theme
'
,
r
'
^(?:word|ppt)/people\.xml$
'
,
r
'
^(?:word|ppt)/numbering\.xml$
'
,
r
'
^(?:word|ppt)/tags/
'
,
# View properties like view mode, last viewed slide etc
r
'
^(?:word|ppt)/viewProps\.xml$
'
,
# Additional presentation-wide properties like printing properties,
...
...
@@ -272,6 +286,59 @@ class MSOfficeParser(ZipParser):
tree
.
write
(
full_path
,
xml_declaration
=
True
)
return
True
def _final_checks(self) -> bool:
    """
    Verify that the identifiers gathered in `self.__counters` really are
    counters.

    Each entry of `self.__counters` is a set of integers. A set of
    distinct positive integers has as many members as its largest
    member only when it is exactly `1..n`, so any mismatch between
    `len()` and `max()` reveals a gap or an out-of-sequence value.

    :return: always `True` for now; an anomaly is only logged as a
        warning (see the TODO below).
    """
    for k, v in self.__counters.items():
        # An empty set means that no identifier of this kind was found:
        # there is nothing to check, and `max()` would raise a
        # ValueError on it.
        if v and len(v) != max(v):
            # TODO: make this an error and return False
            # once the ability to correct the counters is implemented
            logging.warning("%s contains invalid %s: %s", self.filename, k, v)
            return True
    return True
def __collect_counters(self, full_path: str):
    """
    Record the numeric identifiers found in the file at `full_path`.

    MSOffice documents use various counters for cross-references. They
    are all collected into `self.__counters`, so that it can later be
    checked that they are effectively counters, and not unique ids
    usable for fingerprinting.

    :param full_path: path of the file to scan.
    """
    # Read with an explicit encoding instead of the locale-dependent
    # default, so that the result doesn't vary with the platform.
    with open(full_path, encoding='utf-8') as f:
        content = f.read()

    # relationship id: matches `Id="rIdN"` as well as `r:id="rIdN"`.
    # `[iI][dD]` matches the two letters of the attribute name; a single
    # `[iIdD]` class would only match one character and thus never
    # match at all.
    for i in re.findall(r'(?:\s|r:)[iI][dD]="rId([0-9]+)"(?:\s|/)', content):
        self.__counters['rid'].add(int(i))

    # Connector non visual property
    for i in re.findall(r'<p:cNvPr id="([0-9]+)"', content):
        self.__counters['cNvPr'].add(int(i))
def __randomize_creationId(self, full_path: str) -> bool:
    """
    Overwrite the `val` attribute of every `p14:creationId` element of
    the file at `full_path` with a random number, then write the file
    back in place.

    :param full_path: path of the XML file to process.
    :return: `False` if the file can't be parsed as XML, `True`
        otherwise, including when the file doesn't declare the `p14`
        namespace, in which case it's left untouched.
    """
    try:
        tree, namespace = _parse_xml(full_path)
    except ET.ParseError as e:
        logging.error("Unable to parse %s: %s", full_path, e)
        return False

    if 'p14' not in namespace:
        return True

    for item in tree.iterfind('.//p14:creationId', namespace):
        # `randint` includes both of its bounds: stop at 2**32 - 1 so
        # that the value always fits in 32 bits.
        # NOTE(review): `val` is presumably an xsd:unsignedInt, to be
        # confirmed against the p14 schema.
        item.set('val', '%s' % random.randint(0, 2**32 - 1))
    tree.write(full_path, xml_declaration=True)
    return True
def __randomize_sldMasterId(self, full_path: str) -> bool:
    """
    Overwrite the `id` attribute of every `p:sldMasterId` element of
    the file at `full_path` with a random number, then write the file
    back in place.

    :param full_path: path of the XML file to process.
    :return: `False` if the file can't be parsed as XML, `True`
        otherwise, including when the file doesn't declare the `p`
        namespace, in which case it's left untouched.
    """
    try:
        tree, namespace = _parse_xml(full_path)
    except ET.ParseError as e:
        logging.error("Unable to parse %s: %s", full_path, e)
        return False

    if 'p' not in namespace:
        return True

    for item in tree.iterfind('.//p:sldMasterId', namespace):
        # ECMA-376 defines ST_SlideMasterId as an unsigned 32-bit
        # integer greater than or equal to 2147483648 (2**31).
        # `randint` includes both of its bounds, hence the `- 1`.
        item.set('id', '%s' % random.randint(2**31, 2**32 - 1))
    tree.write(full_path, xml_declaration=True)
    return True
def
_specific_cleanup
(
self
,
full_path
:
str
)
->
bool
:
# pylint: disable=too-many-return-statements
if
os
.
stat
(
full_path
).
st_size
==
0
:
# Don't process empty files
...
...
@@ -280,6 +347,11 @@ class MSOfficeParser(ZipParser):
if
not
full_path
.
endswith
(
'
.xml
'
):
return
True
if
self
.
__randomize_creationId
(
full_path
)
is
False
:
return
False
self
.
__collect_counters
(
full_path
)
if
full_path
.
endswith
(
'
/[Content_Types].xml
'
):
# this file contains references to files that we might
# remove, and MS Office doesn't like dangling references
...
...
@@ -310,6 +382,9 @@ class MSOfficeParser(ZipParser):
f
.
write
(
b
'
<?xml version=
"
1.0
"
encoding=
"
UTF-8
"
standalone=
"
yes
"
?>
'
)
uid
=
str
(
uuid
.
uuid4
()).
encode
(
'
utf-8
'
)
f
.
write
(
b
'
<a:tblStyleLst def=
"
{%s}
"
xmlns:a=
"
http://schemas.openxmlformats.org/drawingml/2006/main
"
/>
'
%
uid
)
elif
full_path
.
endswith
(
'
ppt/presentation.xml
'
):
if
self
.
__randomize_sldMasterId
(
full_path
)
is
False
:
return
False
if
self
.
__remove_rsid
(
full_path
)
is
False
:
return
False
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment