Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
mat2
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Monitor
Service Desk
Analyze
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
jvoisin
mat2
Commits
02f7605a
Commit
02f7605a
authored
6 years ago
by
Julien (jvoisin) Voisin
Browse files
Options
Downloads
Patches
Plain Diff
MAT2 is now cleaning revisions from odt files!
parent
80fc4ffb
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
.gitlab-ci.yml
+2
-2
2 additions, 2 deletions
.gitlab-ci.yml
libmat2/office.py
+62
-14
62 additions, 14 deletions
libmat2/office.py
tests/test_libmat2.py
+21
-0
21 additions, 0 deletions
tests/test_libmat2.py
with
85 additions
and
16 deletions
.gitlab-ci.yml
+
2
−
2
View file @
02f7605a
...
...
@@ -6,10 +6,10 @@ stages:
bandit
:
stage
:
linting
script
:
script
:
# TODO: remove B405 and B314
-
apt-get -qqy update
-
apt-get -qqy install --no-install-recommends python3-bandit
-
bandit -r ./libmat2 --format txt --skip B101,B404,B603
-
bandit -r ./libmat2 --format txt --skip B101,B404,B603
,B405,B314
pyflakes
:
stage
:
linting
...
...
This diff is collapsed.
Click to expand it.
libmat2/office.py
+
62
−
14
View file @
02f7605a
...
...
@@ -4,8 +4,10 @@ import shutil
import
tempfile
import
datetime
import
zipfile
import
xml.etree.ElementTree
as
ET
from
typing
import
Dict
,
Set
,
Pattern
from
.
import
abstract
,
parser_factory
# Make pyflakes happy
...
...
@@ -13,7 +15,12 @@ assert Set
assert
Pattern
class
ArchiveBasedAbstractParser
(
abstract
.
AbstractParser
):
# Those are the files that have a format that _isn't_
# supported by MAT2, but that we want to keep anyway.
files_to_keep
=
set
()
# type: Set[str]
# Those are the files that we _do not_ want to keep,
# no matter if they are supported or not.
files_to_omit
=
set
()
# type: Set[Pattern]
def
__init__
(
self
,
filename
):
...
...
@@ -23,6 +30,11 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
except
zipfile
.
BadZipFile
:
raise
ValueError
def _specific_cleanup(self, full_path: str) -> bool:
    """
    Hook for format-specific treatment of a single file
    taken from the archive.

    The default implementation leaves the file at `full_path`
    untouched and always returns `True`; subclasses may override it.
    """
    return True
def
_clean_zipinfo
(
self
,
zipinfo
:
zipfile
.
ZipInfo
)
->
zipfile
.
ZipInfo
:
zipinfo
.
create_system
=
3
# Linux
zipinfo
.
comment
=
b
''
...
...
@@ -56,26 +68,31 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
for
item
in
zin
.
infolist
():
if
item
.
filename
[
-
1
]
==
'
/
'
:
# `is_dir` is added in Python3.6
continue
# don't keep empty folders
elif
item
.
filename
in
self
.
files_to_keep
:
item
=
self
.
_clean_zipinfo
(
item
)
zout
.
writestr
(
item
,
zin
.
read
(
item
))
continue
elif
any
(
map
(
lambda
r
:
r
.
search
(
item
.
filename
),
self
.
files_to_omit
)):
continue
zin
.
extract
(
member
=
item
,
path
=
temp_folder
)
full_path
=
os
.
path
.
join
(
temp_folder
,
item
.
filename
)
tmp_parser
,
mtype
=
parser_factory
.
get_parser
(
full_path
)
# type: ignore
if
not
tmp_parser
:
shutil
.
rmtree
(
temp_folder
)
os
.
remove
(
self
.
output_filename
)
print
(
"
%s
'
s format (%s) isn
'
t supported
"
%
(
item
.
filename
,
mtype
))
return
False
tmp_parser
.
remove_all
()
self
.
_specific_cleanup
(
full_path
)
if
item
.
filename
in
self
.
files_to_keep
:
# those files aren't supported, but we want to add them anyway
pass
elif
any
(
map
(
lambda
r
:
r
.
search
(
item
.
filename
),
self
.
files_to_omit
)):
continue
else
:
# supported files that we want to clean then add
tmp_parser
,
mtype
=
parser_factory
.
get_parser
(
full_path
)
# type: ignore
if
not
tmp_parser
:
shutil
.
rmtree
(
temp_folder
)
os
.
remove
(
self
.
output_filename
)
print
(
"
%s
'
s format (%s) isn
'
t supported
"
%
(
item
.
filename
,
mtype
))
return
False
tmp_parser
.
remove_all
()
os
.
rename
(
tmp_parser
.
output_filename
,
full_path
)
zinfo
=
zipfile
.
ZipInfo
(
item
.
filename
)
# type: ignore
clean_zinfo
=
self
.
_clean_zipinfo
(
zinfo
)
with
open
(
tmp_parser
.
output_filename
,
'
rb
'
)
as
f
:
with
open
(
full_path
,
'
rb
'
)
as
f
:
zout
.
writestr
(
clean_zinfo
,
f
.
read
())
shutil
.
rmtree
(
temp_folder
)
...
...
@@ -149,6 +166,37 @@ class LibreOfficeParser(ArchiveBasedAbstractParser):
'
^Thumbnails/
'
,
}))
def __remove_revisions(self, full_path: str) -> bool:
    """
    Remove every `text:tracked-changes` element found under an
    `office:text` element of the XML file at `full_path`, rewriting
    the file in place.

    The file is left untouched when it declares no `office` or no
    `text` namespace prefix, since it then can't contain the elements
    we are looking for.

    :param full_path: path of the XML file to clean.
    :return: always `True`.
    """
    def parse_map(path):
        # etree support for ns is a bit rough: prefixes are dropped
        # while parsing, so collect the prefix -> uri declarations first.
        ns_map = {}
        for _, (prefix, uri) in ET.iterparse(path, ("start-ns",)):
            ns_map[prefix] = uri
        return ns_map

    ns = parse_map(full_path)
    # Looking up an undeclared prefix makes ElementPath raise a
    # SyntaxError, so bail out unless both prefixes are known.
    if 'office' not in ns or 'text' not in ns:
        # no revisions in the current file
        return True

    # Register the namespaces, so that the original prefixes are
    # written back instead of generated ns0/ns1/... ones.
    for prefix, uri in ns.items():
        ET.register_namespace(prefix, uri)

    tree = ET.parse(full_path)
    for text in tree.getroot().iterfind('.//office:text', ns):
        # `Element.remove` only works on direct children, so each match
        # is removed from its actual parent. Both the parents and the
        # matches are snapshotted into lists before the tree is mutated.
        for parent in list(text.iter()):
            for changes in parent.findall('text:tracked-changes', ns):
                parent.remove(changes)

    tree.write(full_path, xml_declaration=True)
    return True
def _specific_cleanup(self, full_path: str) -> bool:
    """
    Apply the LibreOffice-specific treatment to an extracted file.

    Only a file whose basename is `content.xml` is handed to the
    revisions remover, whose result is returned; any other file is
    left alone and `True` is returned.
    """
    if os.path.basename(full_path) != 'content.xml':
        return True
    return self.__remove_revisions(full_path)
def
get_meta
(
self
)
->
Dict
[
str
,
str
]:
"""
Yes, I know that parsing xml with regexp ain
'
t pretty,
...
...
This diff is collapsed.
Click to expand it.
tests/test_libmat2.py
+
21
−
0
View file @
02f7605a
...
...
@@ -122,6 +122,27 @@ class TestRemovingThumbnails(unittest.TestCase):
os
.
remove
(
'
./tests/data/clean.cleaned.odt
'
)
class TestRevisionsCleaning(unittest.TestCase):
    """ Check that tracked changes are removed from odt files. """

    @staticmethod
    def _read_content(path):
        """ Return the raw bytes of `content.xml` in the archive at `path`. """
        # `ZipFile.read` opens and closes the member itself, so no
        # file handle is left dangling.
        with zipfile.ZipFile(path) as zipin:
            return zipin.read('content.xml')

    @staticmethod
    def _remove_if_present(path):
        """ Delete `path`, tolerating the case where it was never created. """
        try:
            os.remove(path)
        except FileNotFoundError:
            # Deliberate: the cleaned file doesn't exist if cleaning failed.
            pass

    def test_libreoffice(self):
        original = self._read_content('./tests/data/revision.odt')
        self.assertIn(b'tracked-changes', original)

        shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt')
        # Registered right away so that the files are removed
        # even when one of the assertions below fails.
        self.addCleanup(self._remove_if_present, './tests/data/clean.odt')
        self.addCleanup(self._remove_if_present, './tests/data/clean.cleaned.odt')

        p = office.LibreOfficeParser('./tests/data/clean.odt')
        self.assertTrue(p.remove_all())

        cleaned = self._read_content('./tests/data/clean.cleaned.odt')
        self.assertNotIn(b'tracked-changes', cleaned)
class
TestDeepCleaning
(
unittest
.
TestCase
):
def
__check_deep_meta
(
self
,
p
):
tempdir
=
tempfile
.
mkdtemp
()
...
...
This diff is collapsed.
Click to expand it.
jvoisin
@jvoisin
mentioned in issue
#39 (closed)
·
6 years ago
mentioned in issue
#39 (closed)
mentioned in issue #39
Toggle commit list
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment