Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
mat2
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
12
Issues
12
List
Boards
Labels
Service Desk
Milestones
Merge Requests
1
Merge Requests
1
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
jvoisin
mat2
Commits
82cc822a
Commit
82cc822a
authored
Apr 27, 2019
by
jvoisin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add tar archive support
parent
20ed5eb7
Pipeline
#24294
passed with stages
in 1 minute and 45 seconds
Changes
5
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
274 additions
and
69 deletions
+274
-69
libmat2/archive.py
libmat2/archive.py
+193
-63
libmat2/epub.py
libmat2/epub.py
+1
-1
libmat2/office.py
libmat2/office.py
+3
-3
tests/test_corrupted_files.py
tests/test_corrupted_files.py
+27
-2
tests/test_libmat2.py
tests/test_libmat2.py
+50
-0
No files found.
libmat2/archive.py
View file @
82cc822a
import
abc
import
zipfile
import
datetime
import
tarfile
import
tempfile
import
os
import
logging
...
...
@@ -11,14 +13,37 @@ from . import abstract, UnknownMemberPolicy, parser_factory
# Make pyflakes happy
assert
Set
assert
Pattern
assert
List
assert
Union
# pylint: disable=not-callable,assignment-from-no-return
# An ArchiveClass is a class representing an archive,
# while an ArchiveMember is a class representing an element
# (usually a file) of an archive.
ArchiveClass
=
Union
[
zipfile
.
ZipFile
,
tarfile
.
TarFile
]
ArchiveMember
=
Union
[
zipfile
.
ZipInfo
,
tarfile
.
TarInfo
]
class
ArchiveBasedAbstractParser
(
abstract
.
AbstractParser
):
""" Office files (.docx, .odt, …) are zipped files. """
"""Base class for all archive-based formats.
Welcome to a world of frustrating complexity and tediousness:
- A lot of file formats (docx, odt, epubs, …) are archive-based,
so we need to add callbacks everywhere to allow their respective
parsers to apply specific cleanup to the required files.
- Python has two different modules to deal with .tar and .zip files,
with similar-but-yet-o-so-different API, so we need to write
a ghetto-wrapper to avoid duplicating everything
- The combination of @staticmethod and @abstractstaticmethod is
required because for now, mypy doesn't know that
@abstractstaticmethod is, indeed, a static method.
- Mypy is too dumb (yet) to realise that a type A is valid under
the Union[A, B] constraint, hence the weird `# type: ignore`
annotations.
"""
def
__init__
(
self
,
filename
):
super
().
__init__
(
filename
)
self
.
archive_class
=
None
# type: Optional[ArchiveClass]
self
.
member_class
=
None
# type: Optional[ArchiveMember]
# Those are the files that have a format that _isn't_
# supported by MAT2, but that we want to keep anyway.
...
...
@@ -32,10 +57,10 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# the archive?
self
.
unknown_member_policy
=
UnknownMemberPolicy
.
ABORT
# type: UnknownMemberPolicy
try
:
# better fail here than later
zipfile
.
ZipFile
(
self
.
filename
)
except
zipfile
.
BadZipFile
:
raise
ValueError
self
.
is_archive_valid
()
def
is_archive_valid
(
self
)
:
"""Raise a ValueError is the current archive isn't a valid one."""
def
_specific_cleanup
(
self
,
full_path
:
str
)
->
bool
:
""" This method can be used to apply specific treatment
...
...
@@ -50,59 +75,57 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
return
{}
# pragma: no cover
@
staticmethod
def
_clean_zipinfo
(
zipinfo
:
zipfile
.
ZipInfo
)
->
zipfile
.
ZipInfo
:
zipinfo
.
create_system
=
3
# Linux
zipinfo
.
comment
=
b
''
zipinfo
.
date_time
=
(
1980
,
1
,
1
,
0
,
0
,
0
)
# this is as early as a zipfile can be
return
zipinfo
@
abc
.
abstractstaticmethod
def
_get_all_members
(
archive
:
ArchiveClass
)
->
List
[
ArchiveMember
]:
"""Return all the members of the archive."""
@
staticmethod
def
_get_zipinfo_meta
(
zipinfo
:
zipfile
.
ZipInfo
)
->
Dict
[
str
,
str
]:
metadata
=
{}
if
zipinfo
.
create_system
==
3
:
# this is Linux
pass
elif
zipinfo
.
create_system
==
2
:
metadata
[
'create_system'
]
=
'Windows'
else
:
metadata
[
'create_system'
]
=
'Weird'
@
abc
.
abstractstaticmethod
def
_clean_member
(
member
:
ArchiveMember
)
->
ArchiveMember
:
"""Remove all the metadata for a given member."""
if
zipinfo
.
comment
:
metadata
[
'comment'
]
=
zipinfo
.
comment
# type: ignore
@
staticmethod
@
abc
.
abstractstaticmethod
def
_get_member_meta
(
member
:
ArchiveMember
)
->
Dict
[
str
,
str
]:
"""Return all the metadata of a given member."""
if
zipinfo
.
date_time
!=
(
1980
,
1
,
1
,
0
,
0
,
0
):
metadata
[
'date_time'
]
=
str
(
datetime
.
datetime
(
*
zipinfo
.
date_time
))
@
staticmethod
@
abc
.
abstractstaticmethod
def
_get_member_name
(
member
:
ArchiveMember
)
->
str
:
"""Return the name of the given member."""
return
metadata
@
staticmethod
@
abc
.
abstractstaticmethod
def
_add_file_to_archive
(
archive
:
ArchiveClass
,
member
:
ArchiveMember
,
full_path
:
str
):
"""Add the file at full_path to the archive, via the given member."""
def
get_meta
(
self
)
->
Dict
[
str
,
Union
[
str
,
dict
]]:
meta
=
dict
()
# type: Dict[str, Union[str, dict]]
with
zipfile
.
ZipFile
(
self
.
filename
)
as
zin
:
with
self
.
archive_class
(
self
.
filename
)
as
zin
:
temp_folder
=
tempfile
.
mkdtemp
()
for
item
in
zin
.
infolist
():
local_meta
=
dict
()
# type: Dict[str, Union[str, Dict]]
for
k
,
v
in
self
.
_get_zipinfo_meta
(
item
).
items
():
local_meta
[
k
]
=
v
for
item
in
self
.
_get_all_members
(
zin
):
local_meta
=
self
.
_get_member_meta
(
item
)
member_name
=
self
.
_get_member_name
(
item
)
if
item
.
file
name
[
-
1
]
==
'/'
:
# pragma: no cover
if
member_
name
[
-
1
]
==
'/'
:
# pragma: no cover
# `is_dir` is added in Python3.6
continue
# don't keep empty folders
zin
.
extract
(
member
=
item
,
path
=
temp_folder
)
full_path
=
os
.
path
.
join
(
temp_folder
,
item
.
file
name
)
full_path
=
os
.
path
.
join
(
temp_folder
,
member_
name
)
specific_meta
=
self
.
_specific_get_meta
(
full_path
,
item
.
filename
)
for
(
k
,
v
)
in
specific_meta
.
items
():
local_meta
[
k
]
=
v
specific_meta
=
self
.
_specific_get_meta
(
full_path
,
member_name
)
local_meta
=
{
**
local_meta
,
**
specific_meta
}
tmp_parser
,
_
=
parser_factory
.
get_parser
(
full_path
)
# type: ignore
if
tmp_parser
:
for
k
,
v
in
tmp_parser
.
get_meta
().
items
():
local_meta
[
k
]
=
v
member_parser
,
_
=
parser_factory
.
get_parser
(
full_path
)
# type: ignore
if
member_parser
:
local_meta
=
{
**
local_meta
,
**
member_parser
.
get_meta
()}
if
local_meta
:
meta
[
item
.
file
name
]
=
local_meta
meta
[
member_
name
]
=
local_meta
shutil
.
rmtree
(
temp_folder
)
return
meta
...
...
@@ -110,17 +133,19 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
def
remove_all
(
self
)
->
bool
:
# pylint: disable=too-many-branches
with
zipfile
.
ZipFile
(
self
.
filename
)
as
zin
,
\
zipfile
.
ZipFile
(
self
.
output_filename
,
'w'
)
as
zout
:
with
self
.
archive_class
(
self
.
filename
)
as
zin
,
\
self
.
archive_class
(
self
.
output_filename
,
'w'
)
as
zout
:
temp_folder
=
tempfile
.
mkdtemp
()
abort
=
False
items
=
list
()
# type: List[zipfile.ZipInfo]
for
item
in
sorted
(
zin
.
infolist
(),
key
=
lambda
z
:
z
.
filename
):
# Sort the items to process, to reduce fingerprinting,
# and keep them in the `items` variable.
items
=
list
()
# type: List[ArchiveMember]
for
item
in
sorted
(
self
.
_get_all_members
(
zin
),
key
=
self
.
_get_member_name
):
# Some fileformats do require to have the `mimetype` file
# as the first file in the archive.
if
item
.
filename
==
'mimetype'
:
if
self
.
_get_member_name
(
item
)
==
'mimetype'
:
items
=
[
item
]
+
items
else
:
items
.
append
(
item
)
...
...
@@ -128,53 +153,53 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# Since files order is a fingerprint factor,
# we're iterating (and thus inserting) them in lexicographic order.
for
item
in
items
:
if
item
.
filename
[
-
1
]
==
'/'
:
# `is_dir` is added in Python3.6
member_name
=
self
.
_get_member_name
(
item
)
if
member_name
[
-
1
]
==
'/'
:
# `is_dir` is added in Python3.6
continue
# don't keep empty folders
zin
.
extract
(
member
=
item
,
path
=
temp_folder
)
full_path
=
os
.
path
.
join
(
temp_folder
,
item
.
file
name
)
full_path
=
os
.
path
.
join
(
temp_folder
,
member_
name
)
if
self
.
_specific_cleanup
(
full_path
)
is
False
:
logging
.
warning
(
"Something went wrong during deep cleaning of %s"
,
item
.
file
name
)
member_
name
)
abort
=
True
continue
if
any
(
map
(
lambda
r
:
r
.
search
(
item
.
file
name
),
self
.
files_to_keep
)):
if
any
(
map
(
lambda
r
:
r
.
search
(
member_
name
),
self
.
files_to_keep
)):
# those files aren't supported, but we want to add them anyway
pass
elif
any
(
map
(
lambda
r
:
r
.
search
(
item
.
file
name
),
self
.
files_to_omit
)):
elif
any
(
map
(
lambda
r
:
r
.
search
(
member_
name
),
self
.
files_to_omit
)):
continue
else
:
# supported files that we want to first clean, then add
tmp
_parser
,
mtype
=
parser_factory
.
get_parser
(
full_path
)
# type: ignore
if
not
tmp
_parser
:
member
_parser
,
mtype
=
parser_factory
.
get_parser
(
full_path
)
# type: ignore
if
not
member
_parser
:
if
self
.
unknown_member_policy
==
UnknownMemberPolicy
.
OMIT
:
logging
.
warning
(
"In file %s, omitting unknown element %s (format: %s)"
,
self
.
filename
,
item
.
file
name
,
mtype
)
self
.
filename
,
member_
name
,
mtype
)
continue
elif
self
.
unknown_member_policy
==
UnknownMemberPolicy
.
KEEP
:
logging
.
warning
(
"In file %s, keeping unknown element %s (format: %s)"
,
self
.
filename
,
item
.
file
name
,
mtype
)
self
.
filename
,
member_
name
,
mtype
)
else
:
logging
.
error
(
"In file %s, element %s's format (%s) "
\
"isn't supported"
,
self
.
filename
,
item
.
file
name
,
mtype
)
self
.
filename
,
member_
name
,
mtype
)
abort
=
True
continue
if
tmp_parser
:
if
tmp
_parser
.
remove_all
()
is
False
:
else
:
if
member
_parser
.
remove_all
()
is
False
:
logging
.
warning
(
"In file %s, something went wrong
\
with the cleaning of %s
\
(format: %s)"
,
self
.
filename
,
item
.
file
name
,
mtype
)
self
.
filename
,
member_
name
,
mtype
)
abort
=
True
continue
os
.
rename
(
tmp
_parser
.
output_filename
,
full_path
)
os
.
rename
(
member
_parser
.
output_filename
,
full_path
)
zinfo
=
zipfile
.
ZipInfo
(
item
.
filename
)
# type: ignore
clean_zinfo
=
self
.
_clean_zipinfo
(
zinfo
)
with
open
(
full_path
,
'rb'
)
as
f
:
zout
.
writestr
(
clean_zinfo
,
f
.
read
())
zinfo
=
self
.
member_class
(
member_name
)
# type: ignore
clean_zinfo
=
self
.
_clean_member
(
zinfo
)
self
.
_add_file_to_archive
(
zout
,
clean_zinfo
,
full_path
)
shutil
.
rmtree
(
temp_folder
)
if
abort
:
...
...
@@ -183,6 +208,111 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
return
True
class TarParser(ArchiveBasedAbstractParser):
    """Archive parser for tar files, built on top of the `tarfile` module."""
    mimetypes = {'application/x-tar'}

    def __init__(self, filename):
        super().__init__(filename)
        # Concrete archive/member types that the generic code of the
        # base class instantiates.
        self.archive_class = tarfile.TarFile
        self.member_class = tarfile.TarInfo

    def is_archive_valid(self):
        """Raise a ValueError if the file isn't recognised as a tar archive."""
        if not tarfile.is_tarfile(self.filename):
            raise ValueError

    @staticmethod
    def _clean_member(member: ArchiveMember) -> ArchiveMember:
        """Reset the mtime, uid, gid, uname and gname of the given member."""
        assert isinstance(member, tarfile.TarInfo)  # please mypy
        member.mtime = 0
        member.uid = 0
        member.gid = 0
        member.uname = ''
        member.gname = ''
        return member

    @staticmethod
    def _get_member_meta(member: ArchiveMember) -> Dict[str, str]:
        """Return the non-default mtime/uid/gid/uname/gname of the member."""
        assert isinstance(member, tarfile.TarInfo)  # please mypy
        metadata = {}
        # Numeric fields: reported (as strings) when they aren't zero.
        for field in ('mtime', 'uid', 'gid'):
            value = getattr(member, field)
            if value != 0:
                metadata[field] = str(value)
        # Textual fields: reported as-is when they aren't empty.
        for field in ('uname', 'gname'):
            value = getattr(member, field)
            if value != '':
                metadata[field] = value
        return metadata

    @staticmethod
    def _add_file_to_archive(archive: ArchiveClass, member: ArchiveMember,
                             full_path: str):
        """Add the file at full_path to the archive, under the member's name.

        The metadata of the stored entry are scrubbed by passing
        `_clean_member` as the `filter` of `TarFile.add`.
        """
        assert isinstance(member, tarfile.TarInfo)  # please mypy
        assert isinstance(archive, tarfile.TarFile)  # please mypy
        archive.add(name=full_path, arcname=member.name,
                    filter=TarParser._clean_member)  # type: ignore

    @staticmethod
    def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
        """Return all the members of the given tar archive."""
        assert isinstance(archive, tarfile.TarFile)  # please mypy
        return archive.getmembers()  # type: ignore

    @staticmethod
    def _get_member_name(member: ArchiveMember) -> str:
        """Return the name of the given tar member."""
        assert isinstance(member, tarfile.TarInfo)  # please mypy
        return member.name
class ZipParser(ArchiveBasedAbstractParser):
    """Archive parser for zip files, built on top of the `zipfile` module."""
    mimetypes = {'application/zip'}

    def __init__(self, filename):
        super().__init__(filename)
        # Concrete archive/member types that the generic code of the
        # base class instantiates.
        self.archive_class = zipfile.ZipFile
        self.member_class = zipfile.ZipInfo

    def is_archive_valid(self):
        """Raise a ValueError if the file can't be opened as a zip archive."""
        try:
            # The archive is opened only to validate it: close it right
            # away instead of leaking the underlying file handle.
            with zipfile.ZipFile(self.filename):
                pass
        except zipfile.BadZipFile as e:
            raise ValueError from e

    @staticmethod
    def _clean_member(member: ArchiveMember) -> ArchiveMember:
        """Reset create_system, comment and date_time of the given member."""
        assert isinstance(member, zipfile.ZipInfo)  # please mypy
        member.create_system = 3  # Linux
        member.comment = b''
        member.date_time = (1980, 1, 1, 0, 0, 0)  # this is as early as a zipfile can be
        return member

    @staticmethod
    def _get_member_meta(member: ArchiveMember) -> Dict[str, str]:
        """Return the create_system/comment/date_time metadata of the member.

        Only values differing from the ones set by `_clean_member`
        are reported.
        """
        assert isinstance(member, zipfile.ZipInfo)  # please mypy
        metadata = {}
        if member.create_system == 3:  # this is Linux
            pass
        elif member.create_system == 2:
            # NOTE(review): the ZIP specification seems to list 0 as
            # MS-DOS/FAT and 2 as OpenVMS; confirm this 'Windows' label.
            metadata['create_system'] = 'Windows'
        else:
            metadata['create_system'] = 'Weird'

        if member.comment:
            metadata['comment'] = member.comment  # type: ignore

        if member.date_time != (1980, 1, 1, 0, 0, 0):
            metadata['date_time'] = str(datetime.datetime(*member.date_time))

        return metadata

    @staticmethod
    def _add_file_to_archive(archive: ArchiveClass, member: ArchiveMember,
                             full_path: str):
        """Write the content of the file at full_path into the archive,
        using the given member as its entry."""
        assert isinstance(archive, zipfile.ZipFile)  # please mypy
        assert isinstance(member, zipfile.ZipInfo)  # please mypy
        with open(full_path, 'rb') as f:
            archive.writestr(member, f.read())

    @staticmethod
    def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
        """Return all the members of the given zip archive."""
        assert isinstance(archive, zipfile.ZipFile)  # please mypy
        return archive.infolist()  # type: ignore

    @staticmethod
    def _get_member_name(member: ArchiveMember) -> str:
        """Return the filename of the given zip member."""
        assert isinstance(member, zipfile.ZipInfo)  # please mypy
        return member.filename
libmat2/epub.py
View file @
82cc822a
...
...
@@ -5,7 +5,7 @@ import xml.etree.ElementTree as ET # type: ignore
from
.
import
archive
,
office
class
EPUBParser
(
archive
.
ArchiveBasedAbstract
Parser
):
class
EPUBParser
(
archive
.
Zip
Parser
):
mimetypes
=
{
'application/epub+zip'
,
}
metadata_namespace
=
'{http://purl.org/dc/elements/1.1/}'
...
...
libmat2/office.py
View file @
82cc822a
...
...
@@ -6,7 +6,7 @@ from typing import Dict, Set, Pattern, Tuple, Any
import
xml.etree.ElementTree
as
ET
# type: ignore
from
.archive
import
ArchiveBasedAbstract
Parser
from
.archive
import
Zip
Parser
# pylint: disable=line-too-long
...
...
@@ -43,7 +43,7 @@ def _sort_xml_attributes(full_path: str) -> bool:
return
True
class
MSOfficeParser
(
ArchiveBasedAbstract
Parser
):
class
MSOfficeParser
(
Zip
Parser
):
mimetypes
=
{
'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
,
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
,
...
...
@@ -312,7 +312,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
return
{
file_path
:
'harmful content'
,
}
class
LibreOfficeParser
(
ArchiveBasedAbstract
Parser
):
class
LibreOfficeParser
(
Zip
Parser
):
mimetypes
=
{
'application/vnd.oasis.opendocument.text'
,
'application/vnd.oasis.opendocument.spreadsheet'
,
...
...
tests/test_corrupted_files.py
View file @
82cc822a
#!/usr/bin/env python3
import
unittest
import
time
import
shutil
import
os
import
logging
import
zipfile
import
tarfile
from
libmat2
import
pdf
,
images
,
audio
,
office
,
parser_factory
,
torrent
from
libmat2
import
harmless
,
video
,
web
from
libmat2
import
harmless
,
video
,
web
,
archive
# No need for logging messages: should something go wrong,
# the testsuite _will_ fail.
...
...
@@ -278,7 +280,6 @@ class TestCorruptedFiles(unittest.TestCase):
p
.
remove_all
()
os
.
remove
(
'./tests/data/clean.html'
)
def
test_epub
(
self
):
with
zipfile
.
ZipFile
(
'./tests/data/clean.epub'
,
'w'
)
as
zout
:
zout
.
write
(
'./tests/data/dirty.jpg'
,
'OEBPS/content.opf'
)
...
...
@@ -291,3 +292,27 @@ class TestCorruptedFiles(unittest.TestCase):
self
.
assertFalse
(
p
.
remove_all
())
os
.
remove
(
'./tests/data/clean.epub'
)
def test_tar(self):
    """Check how the tar parser copes with a corrupted embedded file,
    and with a file that isn't a tar archive at all."""
    with tarfile.TarFile('./tests/data/clean.tar', 'w') as tar_out:
        for path in ('./tests/data/dirty.flac',
                     './tests/data/dirty.docx',
                     './tests/data/dirty.jpg',
                     './tests/data/embedded_corrupted.docx'):
            tar_out.add(path)

        # Hand-crafted member, carrying some metadata of its own.
        png_info = tarfile.TarInfo(name='./tests/data/dirty.png')
        png_info.mtime = time.time()
        png_info.uid = 1337
        png_info.gid = 1338
        with open('./tests/data/dirty.png', 'rb') as png_file:
            tar_out.addfile(png_info, png_file)

    parser, mimetype = parser_factory.get_parser('./tests/data/clean.tar')
    self.assertEqual(mimetype, 'application/x-tar')

    meta = parser.get_meta()
    flac_meta = meta['./tests/data/dirty.flac']
    self.assertEqual(flac_meta['comments'], 'Thank you for using MAT !')
    docx_meta = meta['./tests/data/dirty.docx']
    self.assertEqual(docx_meta['word/media/image1.png']['Comment'],
                     'This is a comment, be careful!')

    # The cleaning is expected to fail on this archive.
    self.assertFalse(parser.remove_all())
    os.remove('./tests/data/clean.tar')

    # A png file disguised as a tar archive must be rejected.
    shutil.copy('./tests/data/dirty.png', './tests/data/clean.tar')
    with self.assertRaises(ValueError):
        archive.TarParser('./tests/data/clean.tar')
    os.remove('./tests/data/clean.tar')
tests/test_libmat2.py
View file @
82cc822a
...
...
@@ -4,6 +4,8 @@ import unittest
import
shutil
import
os
import
re
import
tarfile
import
tempfile
import
zipfile
from
libmat2
import
pdf
,
images
,
audio
,
office
,
parser_factory
,
torrent
,
harmless
...
...
@@ -195,6 +197,19 @@ class TestGetMeta(unittest.TestCase):
self
.
assertEqual
(
meta
[
'version'
],
'1.0'
)
self
.
assertEqual
(
meta
[
'harmful data'
],
'underline is cool'
)
def test_tar(self):
    """Check that the metadata of the files of a tar archive are found."""
    with tarfile.TarFile('./tests/data/dirty.tar', 'w') as tar_out:
        for path in ('./tests/data/dirty.flac',
                     './tests/data/dirty.docx',
                     './tests/data/dirty.jpg'):
            tar_out.add(path)

    parser, mimetype = parser_factory.get_parser('./tests/data/dirty.tar')
    self.assertEqual(mimetype, 'application/x-tar')

    meta = parser.get_meta()
    flac_meta = meta['./tests/data/dirty.flac']
    self.assertEqual(flac_meta['comments'], 'Thank you for using MAT !')
    docx_meta = meta['./tests/data/dirty.docx']
    self.assertEqual(docx_meta['word/media/image1.png']['Comment'],
                     'This is a comment, be careful!')

    os.remove('./tests/data/dirty.tar')
class
TestRemovingThumbnails
(
unittest
.
TestCase
):
def
test_odt
(
self
):
shutil
.
copy
(
'./tests/data/revision.odt'
,
'./tests/data/clean.odt'
)
...
...
@@ -702,3 +717,38 @@ class TestCleaning(unittest.TestCase):
os
.
remove
(
'./tests/data/clean.css'
)
os
.
remove
(
'./tests/data/clean.cleaned.css'
)
os
.
remove
(
'./tests/data/clean.cleaned.cleaned.css'
)
def test_tar(self):
    """Clean a tar archive, then check that the archive and every file
    that it contains are free of metadata."""
    with tarfile.TarFile('./tests/data/dirty.tar', 'w') as zout:
        zout.add('./tests/data/dirty.flac')
        zout.add('./tests/data/dirty.docx')
        zout.add('./tests/data/dirty.jpg')
    p = archive.TarParser('./tests/data/dirty.tar')
    meta = p.get_meta()
    self.assertEqual(meta['./tests/data/dirty.docx']['word/media/image1.png']['Comment'],
                     'This is a comment, be careful!')

    ret = p.remove_all()
    self.assertTrue(ret)

    p = archive.TarParser('./tests/data/dirty.cleaned.tar')
    self.assertEqual(p.get_meta(), {})
    self.assertTrue(p.remove_all())

    # Use a self-cleaning temporary directory, so that the extracted
    # files aren't left behind once the test is over.
    with tempfile.TemporaryDirectory() as tmp_dir:
        with tarfile.open('./tests/data/dirty.cleaned.tar') as zout:
            zout.extractall(path=tmp_dir)

        number_of_files = 0
        for root, _, fnames in os.walk(tmp_dir):
            for f in fnames:
                complete_path = os.path.join(root, f)
                member_parser, _mtype = parser_factory.get_parser(complete_path)
                self.assertIsNotNone(member_parser)
                self.assertEqual(member_parser.get_meta(), {})
                number_of_files += 1
        self.assertEqual(number_of_files, 3)

    os.remove('./tests/data/dirty.tar')
    os.remove('./tests/data/dirty.cleaned.tar')
    os.remove('./tests/data/dirty.cleaned.cleaned.tar')
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment