Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
mat2
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
12
Issues
12
List
Boards
Labels
Service Desk
Milestones
Merge Requests
1
Merge Requests
1
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
jvoisin
mat2
Commits
5ac91cd4
Verified
Commit
5ac91cd4
authored
Feb 20, 2019
by
Brolf
Committed by
georg
Mar 05, 2019
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Refactor {black,white}list into {block,allow}list
Closes #96
parent
c3f097a8
Pipeline
#23182
failed with stages
in 5 minutes and 13 seconds
Changes
6
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
39 additions
and
39 deletions
+39
-39
libmat2/exiftool.py
libmat2/exiftool.py
+2
-2
libmat2/images.py
libmat2/images.py
+4
-4
libmat2/office.py
libmat2/office.py
+3
-3
libmat2/torrent.py
libmat2/torrent.py
+3
-3
libmat2/video.py
libmat2/video.py
+9
-9
libmat2/web.py
libmat2/web.py
+18
-18
No files found.
libmat2/exiftool.py
View file @
5ac91cd4
...
...
@@ -15,14 +15,14 @@ class ExiftoolParser(abstract.AbstractParser):
from an import file, hence why several parsers are re-using its `get_meta`
method.
"""
meta_
white
list
=
set
()
# type: Set[str]
meta_
allow
list
=
set
()
# type: Set[str]
def
get_meta
(
self
)
->
Dict
[
str
,
Union
[
str
,
dict
]]:
out
=
subprocess
.
run
([
_get_exiftool_path
(),
'-json'
,
self
.
filename
],
input_filename
=
self
.
filename
,
check
=
True
,
stdout
=
subprocess
.
PIPE
).
stdout
meta
=
json
.
loads
(
out
.
decode
(
'utf-8'
))[
0
]
for
key
in
self
.
meta_
white
list
:
for
key
in
self
.
meta_
allow
list
:
meta
.
pop
(
key
,
None
)
return
meta
...
...
libmat2/images.py
View file @
5ac91cd4
...
...
@@ -15,7 +15,7 @@ assert Set
class
PNGParser
(
exiftool
.
ExiftoolParser
):
mimetypes
=
{
'image/png'
,
}
meta_
white
list
=
{
'SourceFile'
,
'ExifToolVersion'
,
'FileName'
,
meta_
allow
list
=
{
'SourceFile'
,
'ExifToolVersion'
,
'FileName'
,
'Directory'
,
'FileSize'
,
'FileModifyDate'
,
'FileAccessDate'
,
'FileInodeChangeDate'
,
'FilePermissions'
,
'FileType'
,
'FileTypeExtension'
,
...
...
@@ -44,7 +44,7 @@ class PNGParser(exiftool.ExiftoolParser):
class
GIFParser
(
exiftool
.
ExiftoolParser
):
mimetypes
=
{
'image/gif'
}
meta_
white
list
=
{
'AnimationIterations'
,
'BackgroundColor'
,
'BitsPerPixel'
,
meta_
allow
list
=
{
'AnimationIterations'
,
'BackgroundColor'
,
'BitsPerPixel'
,
'ColorResolutionDepth'
,
'Directory'
,
'Duration'
,
'ExifToolVersion'
,
'FileAccessDate'
,
'FileInodeChangeDate'
,
'FileModifyDate'
,
'FileName'
,
...
...
@@ -86,7 +86,7 @@ class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
class
JPGParser
(
GdkPixbufAbstractParser
):
_type
=
'jpeg'
mimetypes
=
{
'image/jpeg'
}
meta_
white
list
=
{
'SourceFile'
,
'ExifToolVersion'
,
'FileName'
,
meta_
allow
list
=
{
'SourceFile'
,
'ExifToolVersion'
,
'FileName'
,
'Directory'
,
'FileSize'
,
'FileModifyDate'
,
'FileAccessDate'
,
"FileInodeChangeDate"
,
'FilePermissions'
,
'FileType'
,
'FileTypeExtension'
,
...
...
@@ -99,7 +99,7 @@ class JPGParser(GdkPixbufAbstractParser):
class
TiffParser
(
GdkPixbufAbstractParser
):
_type
=
'tiff'
mimetypes
=
{
'image/tiff'
}
meta_
white
list
=
{
'Compression'
,
'ExifByteOrder'
,
'ExtraSamples'
,
meta_
allow
list
=
{
'Compression'
,
'ExifByteOrder'
,
'ExtraSamples'
,
'FillOrder'
,
'PhotometricInterpretation'
,
'PlanarConfiguration'
,
'RowsPerStrip'
,
'SamplesPerPixel'
,
'StripByteCounts'
,
'StripOffsets'
,
'BitsPerSample'
,
...
...
libmat2/office.py
View file @
5ac91cd4
...
...
@@ -89,7 +89,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
r
'^word/theme'
,
r
'^word/people\.xml$'
,
# we have a
white
list in self.files_to_keep,
# we have a
n allow
list in self.files_to_keep,
# so we can trash everything else
r
'^word/_rels/'
,
}))
...
...
@@ -100,7 +100,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
def
__fill_files_to_keep_via_content_types
(
self
)
->
bool
:
""" There is a super-handy `[Content_Types].xml` file
in MS Office archives, describing what each other file contains.
The self.content_types_to_keep member contains a type
white
list,
The self.content_types_to_keep member contains a type
allow
list,
so we're using it to fill the self.files_to_keep one.
"""
with
zipfile
.
ZipFile
(
self
.
filename
)
as
zin
:
...
...
@@ -220,7 +220,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
for
file_to_omit
in
self
.
files_to_omit
:
if
file_to_omit
.
search
(
fname
):
matches
=
map
(
lambda
r
:
r
.
search
(
fname
),
self
.
files_to_keep
)
if
any
(
matches
):
# the file is
whitelisted
if
any
(
matches
):
# the file is
in the allowlist
continue
removed_fnames
.
add
(
fname
)
break
...
...
libmat2/torrent.py
View file @
5ac91cd4
...
...
@@ -6,7 +6,7 @@ from . import abstract
class
TorrentParser
(
abstract
.
AbstractParser
):
mimetypes
=
{
'application/x-bittorrent'
,
}
white
list
=
{
b
'announce'
,
b
'announce-list'
,
b
'info'
}
allow
list
=
{
b
'announce'
,
b
'announce-list'
,
b
'info'
}
def
__init__
(
self
,
filename
):
super
().
__init__
(
filename
)
...
...
@@ -18,14 +18,14 @@ class TorrentParser(abstract.AbstractParser):
def
get_meta
(
self
)
->
Dict
[
str
,
Union
[
str
,
dict
]]:
metadata
=
{}
for
key
,
value
in
self
.
dict_repr
.
items
():
if
key
not
in
self
.
white
list
:
if
key
not
in
self
.
allow
list
:
metadata
[
key
.
decode
(
'utf-8'
)]
=
value
return
metadata
def
remove_all
(
self
)
->
bool
:
cleaned
=
dict
()
for
key
,
value
in
self
.
dict_repr
.
items
():
if
key
in
self
.
white
list
:
if
key
in
self
.
allow
list
:
cleaned
[
key
]
=
value
with
open
(
self
.
output_filename
,
'wb'
)
as
f
:
f
.
write
(
_BencodeHandler
().
bencode
(
cleaned
))
...
...
libmat2/video.py
View file @
5ac91cd4
...
...
@@ -10,10 +10,10 @@ from . import subprocess
class
AbstractFFmpegParser
(
exiftool
.
ExiftoolParser
):
""" Abstract parser for all FFmpeg-based ones, mainly for video. """
# Some fileformats have mandatory metadata fields
meta_key_value_
white
list
=
{}
# type: Dict[str, Union[str, int]]
meta_key_value_
allow
list
=
{}
# type: Dict[str, Union[str, int]]
def
remove_all
(
self
)
->
bool
:
if
self
.
meta_key_value_
white
list
:
if
self
.
meta_key_value_
allow
list
:
logging
.
warning
(
'The format of "%s" (%s) has some mandatory '
'metadata fields; mat2 filled them with standard '
'data.'
,
self
.
filename
,
', '
.
join
(
self
.
mimetypes
))
...
...
@@ -45,8 +45,8 @@ class AbstractFFmpegParser(exiftool.ExiftoolParser):
ret
=
dict
()
# type: Dict[str, Union[str, dict]]
for
key
,
value
in
meta
.
items
():
if
key
in
self
.
meta_key_value_
white
list
.
keys
():
if
value
==
self
.
meta_key_value_
white
list
[
key
]:
if
key
in
self
.
meta_key_value_
allow
list
.
keys
():
if
value
==
self
.
meta_key_value_
allow
list
[
key
]:
continue
ret
[
key
]
=
value
return
ret
...
...
@@ -54,7 +54,7 @@ class AbstractFFmpegParser(exiftool.ExiftoolParser):
class
WMVParser
(
AbstractFFmpegParser
):
mimetypes
=
{
'video/x-ms-wmv'
,
}
meta_
white
list
=
{
'AudioChannels'
,
'AudioCodecID'
,
'AudioCodecName'
,
meta_
allow
list
=
{
'AudioChannels'
,
'AudioCodecID'
,
'AudioCodecName'
,
'ErrorCorrectionType'
,
'AudioSampleRate'
,
'DataPackets'
,
'Directory'
,
'Duration'
,
'ExifToolVersion'
,
'FileAccessDate'
,
'FileInodeChangeDate'
,
'FileLength'
,
...
...
@@ -64,7 +64,7 @@ class WMVParser(AbstractFFmpegParser):
'ImageWidth'
,
'MIMEType'
,
'MaxBitrate'
,
'MaxPacketSize'
,
'Megapixels'
,
'MinPacketSize'
,
'Preroll'
,
'SendDuration'
,
'SourceFile'
,
'StreamNumber'
,
'VideoCodecName'
,
}
meta_key_value_
white
list
=
{
# some metadata are mandatory :/
meta_key_value_
allow
list
=
{
# some metadata are mandatory :/
'AudioCodecDescription'
:
''
,
'CreationDate'
:
'0000:00:00 00:00:00Z'
,
'FileID'
:
'00000000-0000-0000-0000-000000000000'
,
...
...
@@ -78,7 +78,7 @@ class WMVParser(AbstractFFmpegParser):
class
AVIParser
(
AbstractFFmpegParser
):
mimetypes
=
{
'video/x-msvideo'
,
}
meta_
white
list
=
{
'SourceFile'
,
'ExifToolVersion'
,
'FileName'
,
'Directory'
,
meta_
allow
list
=
{
'SourceFile'
,
'ExifToolVersion'
,
'FileName'
,
'Directory'
,
'FileSize'
,
'FileModifyDate'
,
'FileAccessDate'
,
'FileInodeChangeDate'
,
'FilePermissions'
,
'FileType'
,
'FileTypeExtension'
,
'MIMEType'
,
'FrameRate'
,
'MaxDataRate'
,
...
...
@@ -98,7 +98,7 @@ class AVIParser(AbstractFFmpegParser):
class
MP4Parser
(
AbstractFFmpegParser
):
mimetypes
=
{
'video/mp4'
,
}
meta_
white
list
=
{
'AudioFormat'
,
'AvgBitrate'
,
'Balance'
,
'TrackDuration'
,
meta_
allow
list
=
{
'AudioFormat'
,
'AvgBitrate'
,
'Balance'
,
'TrackDuration'
,
'XResolution'
,
'YResolution'
,
'ExifToolVersion'
,
'FileAccessDate'
,
'FileInodeChangeDate'
,
'FileModifyDate'
,
'FileName'
,
'FilePermissions'
,
'MIMEType'
,
'FileType'
,
...
...
@@ -109,7 +109,7 @@ class MP4Parser(AbstractFFmpegParser):
'MovieDataSize'
,
'VideoFrameRate'
,
'MediaTimeScale'
,
'SourceImageHeight'
,
'SourceImageWidth'
,
'MatrixStructure'
,
'MediaDuration'
}
meta_key_value_
white
list
=
{
# some metadata are mandatory :/
meta_key_value_
allow
list
=
{
# some metadata are mandatory :/
'CreateDate'
:
'0000:00:00 00:00:00'
,
'CurrentTime'
:
'0 s'
,
'MediaCreateDate'
:
'0000:00:00 00:00:00'
,
...
...
libmat2/web.py
View file @
5ac91cd4
...
...
@@ -37,15 +37,15 @@ class CSSParser(abstract.AbstractParser):
class
AbstractHTMLParser
(
abstract
.
AbstractParser
):
tags_bl
a
cklist
=
set
()
# type: Set[str]
tags_bl
o
cklist
=
set
()
# type: Set[str]
# In some html/xml-based formats some tags are mandatory,
# so we're keeping them, but are discarding their content
tags_required_bl
a
cklist
=
set
()
# type: Set[str]
tags_required_bl
o
cklist
=
set
()
# type: Set[str]
def
__init__
(
self
,
filename
):
super
().
__init__
(
filename
)
self
.
__parser
=
_HTMLParser
(
self
.
filename
,
self
.
tags_bl
a
cklist
,
self
.
tags_required_bl
a
cklist
)
self
.
__parser
=
_HTMLParser
(
self
.
filename
,
self
.
tags_bl
o
cklist
,
self
.
tags_required_bl
o
cklist
)
with
open
(
filename
,
encoding
=
'utf-8'
)
as
f
:
self
.
__parser
.
feed
(
f
.
read
())
self
.
__parser
.
close
()
...
...
@@ -59,13 +59,13 @@ class AbstractHTMLParser(abstract.AbstractParser):
class
HTMLParser
(
AbstractHTMLParser
):
mimetypes
=
{
'text/html'
,
}
tags_bl
a
cklist
=
{
'meta'
,
}
tags_required_bl
a
cklist
=
{
'title'
,
}
tags_bl
o
cklist
=
{
'meta'
,
}
tags_required_bl
o
cklist
=
{
'title'
,
}
class
DTBNCXParser
(
AbstractHTMLParser
):
mimetypes
=
{
'application/x-dtbncx+xml'
,
}
tags_required_bl
a
cklist
=
{
'title'
,
'doctitle'
,
'meta'
}
tags_required_bl
o
cklist
=
{
'title'
,
'doctitle'
,
'meta'
}
class
_HTMLParser
(
parser
.
HTMLParser
):
...
...
@@ -79,7 +79,7 @@ class _HTMLParser(parser.HTMLParser):
Also, gotcha: the `tag` parameters are always in lowercase.
"""
def
__init__
(
self
,
filename
,
bl
acklisted_tags
,
required_bla
cklisted_tags
):
def
__init__
(
self
,
filename
,
bl
ocklisted_tags
,
required_blo
cklisted_tags
):
super
().
__init__
()
self
.
filename
=
filename
self
.
__textrepr
=
''
...
...
@@ -90,24 +90,24 @@ class _HTMLParser(parser.HTMLParser):
self
.
__in_dangerous_but_required_tag
=
0
self
.
__in_dangerous_tag
=
0
if
required_bl
acklisted_tags
&
bla
cklisted_tags
:
# pragma: nocover
if
required_bl
ocklisted_tags
&
blo
cklisted_tags
:
# pragma: nocover
raise
ValueError
(
"There is an overlap between %s and %s"
%
(
required_bl
acklisted_tags
,
bla
cklisted_tags
))
self
.
tag_required_bl
acklist
=
required_bla
cklisted_tags
self
.
tag_bl
acklist
=
bla
cklisted_tags
required_bl
ocklisted_tags
,
blo
cklisted_tags
))
self
.
tag_required_bl
ocklist
=
required_blo
cklisted_tags
self
.
tag_bl
ocklist
=
blo
cklisted_tags
def
handle_starttag
(
self
,
tag
:
str
,
attrs
:
List
[
Tuple
[
str
,
str
]]):
original_tag
=
self
.
get_starttag_text
()
self
.
__validation_queue
.
append
(
original_tag
)
if
tag
in
self
.
tag_bl
a
cklist
:
if
tag
in
self
.
tag_bl
o
cklist
:
self
.
__in_dangerous_tag
+=
1
if
self
.
__in_dangerous_tag
==
0
:
if
self
.
__in_dangerous_but_required_tag
==
0
:
self
.
__textrepr
+=
original_tag
if
tag
in
self
.
tag_required_bl
a
cklist
:
if
tag
in
self
.
tag_required_bl
o
cklist
:
self
.
__in_dangerous_but_required_tag
+=
1
def
handle_endtag
(
self
,
tag
:
str
):
...
...
@@ -123,7 +123,7 @@ class _HTMLParser(parser.HTMLParser):
"tag %s in %s"
%
(
tag
,
previous_tag
,
self
.
filename
))
if
tag
in
self
.
tag_required_bl
a
cklist
:
if
tag
in
self
.
tag_required_bl
o
cklist
:
self
.
__in_dangerous_but_required_tag
-=
1
if
self
.
__in_dangerous_tag
==
0
:
...
...
@@ -131,7 +131,7 @@ class _HTMLParser(parser.HTMLParser):
# There is no `get_endtag_text()` method :/
self
.
__textrepr
+=
'</'
+
previous_tag
+
'>'
if
tag
in
self
.
tag_bl
a
cklist
:
if
tag
in
self
.
tag_bl
o
cklist
:
self
.
__in_dangerous_tag
-=
1
def
handle_data
(
self
,
data
:
str
):
...
...
@@ -141,14 +141,14 @@ class _HTMLParser(parser.HTMLParser):
self
.
__textrepr
+=
escape
(
data
)
def
handle_startendtag
(
self
,
tag
:
str
,
attrs
:
List
[
Tuple
[
str
,
str
]]):
if
tag
in
self
.
tag_required_bl
acklist
|
self
.
tag_bla
cklist
:
if
tag
in
self
.
tag_required_bl
ocklist
|
self
.
tag_blo
cklist
:
meta
=
{
k
:
v
for
k
,
v
in
attrs
}
name
=
meta
.
get
(
'name'
,
'harmful metadata'
)
content
=
meta
.
get
(
'content'
,
'harmful data'
)
self
.
__meta
[
name
]
=
content
if
self
.
__in_dangerous_tag
==
0
:
if
tag
in
self
.
tag_required_bl
a
cklist
:
if
tag
in
self
.
tag_required_bl
o
cklist
:
self
.
__textrepr
+=
'<'
+
tag
+
' />'
return
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment