Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jvoisin
mat2
Commits
cc5be860
Commit
cc5be860
authored
Aug 28, 2022
by
jvoisin
Browse files
Simplify the typing annotations
parent
292f44c0
Pipeline
#110347
failed with stages
in 35 seconds
Changes
15
Pipelines
8
Hide whitespace changes
Inline
Side-by-side
libmat2/__init__.py
View file @
cc5be860
...
...
@@ -2,12 +2,11 @@
import
enum
import
importlib
from
typing
import
Dict
,
Optional
,
Union
from
typing
import
Optional
,
Union
from
.
import
exiftool
,
video
# make pyflakes happy
assert
Dict
assert
Optional
assert
Union
...
...
@@ -67,8 +66,8 @@ CMD_DEPENDENCIES = {
},
}
def
check_dependencies
()
->
D
ict
[
str
,
D
ict
[
str
,
bool
]]:
ret
=
dict
()
# type:
D
ict[str, dict]
def
check_dependencies
()
->
d
ict
[
str
,
d
ict
[
str
,
bool
]]:
ret
=
dict
()
# type:
d
ict[str, dict]
for
key
,
value
in
DEPENDENCIES
.
items
():
ret
[
key
]
=
{
...
...
libmat2/abstract.py
View file @
cc5be860
import
abc
import
os
import
re
from
typing
import
Set
,
Dict
,
Union
assert
Set
# make pyflakes happy
from
typing
import
Union
class
AbstractParser
(
abc
.
ABC
):
...
...
@@ -11,8 +9,8 @@ class AbstractParser(abc.ABC):
It might yield `ValueError` on instantiation on invalid files,
and `RuntimeError` when something went wrong in `remove_all`.
"""
meta_list
=
set
()
# type:
S
et[str]
mimetypes
=
set
()
# type:
S
et[str]
meta_list
=
set
()
# type:
s
et[str]
mimetypes
=
set
()
# type:
s
et[str]
def
__init__
(
self
,
filename
:
str
)
->
None
:
"""
...
...
@@ -35,7 +33,7 @@ class AbstractParser(abc.ABC):
self
.
sandbox
=
True
@
abc
.
abstractmethod
def
get_meta
(
self
)
->
D
ict
[
str
,
Union
[
str
,
dict
]]:
def
get_meta
(
self
)
->
d
ict
[
str
,
Union
[
str
,
dict
]]:
"""Return all the metadata of the current file"""
@
abc
.
abstractmethod
...
...
libmat2/archive.py
View file @
cc5be860
...
...
@@ -7,12 +7,11 @@ import tempfile
import
os
import
logging
import
shutil
from
typing
import
Dict
,
Set
,
Pattern
,
Union
,
Any
,
List
from
typing
import
Pattern
,
Union
,
Any
from
.
import
abstract
,
UnknownMemberPolicy
,
parser_factory
# Make pyflakes happy
assert
Set
assert
Pattern
# pylint: disable=not-callable,assignment-from-no-return,too-many-branches
...
...
@@ -53,11 +52,11 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# Those are the files that have a format that _isn't_
# supported by mat2, but that we want to keep anyway.
self
.
files_to_keep
=
set
()
# type:
S
et[Pattern]
self
.
files_to_keep
=
set
()
# type:
s
et[Pattern]
# Those are the files that we _do not_ want to keep,
# no matter if they are supported or not.
self
.
files_to_omit
=
set
()
# type:
S
et[Pattern]
self
.
files_to_omit
=
set
()
# type:
s
et[Pattern]
# what should the parser do if it encounters an unknown file in
# the archive?
...
...
@@ -76,7 +75,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# pylint: disable=unused-argument,no-self-use
return
True
# pragma: no cover
def
_specific_get_meta
(
self
,
full_path
:
str
,
file_path
:
str
)
->
D
ict
[
str
,
Any
]:
def
_specific_get_meta
(
self
,
full_path
:
str
,
file_path
:
str
)
->
d
ict
[
str
,
Any
]:
""" This method can be used to extract specific metadata
from files present in the archive."""
# pylint: disable=unused-argument,no-self-use
...
...
@@ -91,7 +90,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
@
staticmethod
@
abc
.
abstractmethod
def
_get_all_members
(
archive
:
ArchiveClass
)
->
L
ist
[
ArchiveMember
]:
def
_get_all_members
(
archive
:
ArchiveClass
)
->
l
ist
[
ArchiveMember
]:
"""Return all the members of the archive."""
@
staticmethod
...
...
@@ -101,7 +100,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
@
staticmethod
@
abc
.
abstractmethod
def
_get_member_meta
(
member
:
ArchiveMember
)
->
D
ict
[
str
,
str
]:
def
_get_member_meta
(
member
:
ArchiveMember
)
->
d
ict
[
str
,
str
]:
"""Return all the metadata of a given member."""
@
staticmethod
...
...
@@ -132,8 +131,8 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# pylint: disable=unused-argument
return
member
def
get_meta
(
self
)
->
D
ict
[
str
,
Union
[
str
,
dict
]]:
meta
=
dict
()
# type:
D
ict[str, Union[str, dict]]
def
get_meta
(
self
)
->
d
ict
[
str
,
Union
[
str
,
dict
]]:
meta
=
dict
()
# type:
d
ict[str, Union[str, dict]]
with
self
.
archive_class
(
self
.
filename
)
as
zin
:
temp_folder
=
tempfile
.
mkdtemp
()
...
...
@@ -174,7 +173,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# Sort the items to process, to reduce fingerprinting,
# and keep them in the `items` variable.
items
=
list
()
# type:
L
ist[ArchiveMember]
items
=
list
()
# type:
l
ist[ArchiveMember]
for
item
in
sorted
(
self
.
_get_all_members
(
zin
),
key
=
self
.
_get_member_name
):
# Some fileformats do require to have the `mimetype` file
# as the first file in the archive.
...
...
@@ -340,7 +339,7 @@ class TarParser(ArchiveBasedAbstractParser):
return
member
@
staticmethod
def
_get_member_meta
(
member
:
ArchiveMember
)
->
D
ict
[
str
,
str
]:
def
_get_member_meta
(
member
:
ArchiveMember
)
->
d
ict
[
str
,
str
]:
assert
isinstance
(
member
,
tarfile
.
TarInfo
)
# please mypy
metadata
=
{}
if
member
.
mtime
!=
0
:
...
...
@@ -362,7 +361,7 @@ class TarParser(ArchiveBasedAbstractParser):
archive
.
add
(
full_path
,
member
.
name
,
filter
=
TarParser
.
_clean_member
)
# type: ignore
@
staticmethod
def
_get_all_members
(
archive
:
ArchiveClass
)
->
L
ist
[
ArchiveMember
]:
def
_get_all_members
(
archive
:
ArchiveClass
)
->
l
ist
[
ArchiveMember
]:
assert
isinstance
(
archive
,
tarfile
.
TarFile
)
# please mypy
return
archive
.
getmembers
()
# type: ignore
...
...
@@ -416,7 +415,7 @@ class ZipParser(ArchiveBasedAbstractParser):
return
member
@
staticmethod
def
_get_member_meta
(
member
:
ArchiveMember
)
->
D
ict
[
str
,
str
]:
def
_get_member_meta
(
member
:
ArchiveMember
)
->
d
ict
[
str
,
str
]:
assert
isinstance
(
member
,
zipfile
.
ZipInfo
)
# please mypy
metadata
=
{}
if
member
.
create_system
==
3
:
# this is Linux
...
...
@@ -443,7 +442,7 @@ class ZipParser(ArchiveBasedAbstractParser):
compress_type
=
member
.
compress_type
)
@
staticmethod
def
_get_all_members
(
archive
:
ArchiveClass
)
->
L
ist
[
ArchiveMember
]:
def
_get_all_members
(
archive
:
ArchiveClass
)
->
l
ist
[
ArchiveMember
]:
assert
isinstance
(
archive
,
zipfile
.
ZipFile
)
# please mypy
return
archive
.
infolist
()
# type: ignore
...
...
libmat2/audio.py
View file @
cc5be860
...
...
@@ -2,7 +2,7 @@ import mimetypes
import
os
import
shutil
import
tempfile
from
typing
import
Dict
,
Union
from
typing
import
Union
import
mutagen
...
...
@@ -18,7 +18,7 @@ class MutagenParser(abstract.AbstractParser):
except
mutagen
.
MutagenError
:
raise
ValueError
def
get_meta
(
self
)
->
D
ict
[
str
,
Union
[
str
,
dict
]]:
def
get_meta
(
self
)
->
d
ict
[
str
,
Union
[
str
,
dict
]]:
f
=
mutagen
.
File
(
self
.
filename
)
if
f
.
tags
:
return
{
k
:
', '
.
join
(
map
(
str
,
v
))
for
k
,
v
in
f
.
tags
.
items
()}
...
...
@@ -38,8 +38,8 @@ class MutagenParser(abstract.AbstractParser):
class
MP3Parser
(
MutagenParser
):
mimetypes
=
{
'audio/mpeg'
,
}
def
get_meta
(
self
)
->
D
ict
[
str
,
Union
[
str
,
dict
]]:
metadata
=
{}
# type:
D
ict[str, Union[str, dict]]
def
get_meta
(
self
)
->
d
ict
[
str
,
Union
[
str
,
dict
]]:
metadata
=
{}
# type:
d
ict[str, Union[str, dict]]
meta
=
mutagen
.
File
(
self
.
filename
).
tags
if
not
meta
:
return
metadata
...
...
@@ -68,7 +68,7 @@ class FLACParser(MutagenParser):
f
.
save
(
deleteid3
=
True
)
return
True
def
get_meta
(
self
)
->
D
ict
[
str
,
Union
[
str
,
dict
]]:
def
get_meta
(
self
)
->
d
ict
[
str
,
Union
[
str
,
dict
]]:
meta
=
super
().
get_meta
()
for
num
,
picture
in
enumerate
(
mutagen
.
File
(
self
.
filename
).
pictures
):
name
=
picture
.
desc
if
picture
.
desc
else
'Cover %d'
%
num
...
...
libmat2/bubblewrap.py
View file @
cc5be860
...
...
@@ -12,7 +12,7 @@ import shutil
import
subprocess
import
tempfile
import
functools
from
typing
import
List
,
Optional
from
typing
import
Optional
__all__
=
[
'PIPE'
,
'run'
,
'CalledProcessError'
]
...
...
@@ -33,7 +33,7 @@ def _get_bwrap_path() -> str:
def
_get_bwrap_args
(
tempdir
:
str
,
input_filename
:
str
,
output_filename
:
Optional
[
str
]
=
None
)
->
L
ist
[
str
]:
output_filename
:
Optional
[
str
]
=
None
)
->
l
ist
[
str
]:
ro_bind_args
=
[]
cwd
=
os
.
getcwd
()
...
...
@@ -78,7 +78,7 @@ def _get_bwrap_args(tempdir: str,
return
args
def
run
(
args
:
L
ist
[
str
],
def
run
(
args
:
l
ist
[
str
],
input_filename
:
str
,
output_filename
:
Optional
[
str
]
=
None
,
**
kwargs
)
->
subprocess
.
CompletedProcess
:
...
...
libmat2/epub.py
View file @
cc5be860
...
...
@@ -3,7 +3,7 @@ import re
import
uuid
import
zipfile
import
xml.etree.ElementTree
as
ET
# type: ignore
from
typing
import
Dict
,
Any
from
typing
import
Any
from
.
import
archive
,
office
...
...
@@ -37,7 +37,7 @@ class EPUBParser(archive.ZipParser):
if
member_name
.
endswith
(
'META-INF/encryption.xml'
):
raise
ValueError
(
'the file contains encrypted fonts'
)
def
_specific_get_meta
(
self
,
full_path
,
file_path
)
->
D
ict
[
str
,
Any
]:
def
_specific_get_meta
(
self
,
full_path
,
file_path
)
->
d
ict
[
str
,
Any
]:
if
not
file_path
.
endswith
(
'.opf'
):
return
{}
...
...
libmat2/exiftool.py
View file @
cc5be860
...
...
@@ -4,23 +4,20 @@ import logging
import
os
import
shutil
import
subprocess
from
typing
import
Dict
,
Union
,
Set
from
typing
import
Union
from
.
import
abstract
from
.
import
bubblewrap
# Make pyflakes happy
assert
Set
class
ExiftoolParser
(
abstract
.
AbstractParser
):
""" Exiftool is often the easiest way to get all the metadata
from a import file, hence why several parsers are re-using its `get_meta`
method.
"""
meta_allowlist
=
set
()
# type:
S
et[str]
meta_allowlist
=
set
()
# type:
s
et[str]
def
get_meta
(
self
)
->
D
ict
[
str
,
Union
[
str
,
dict
]]:
def
get_meta
(
self
)
->
d
ict
[
str
,
Union
[
str
,
dict
]]:
try
:
if
self
.
sandbox
:
out
=
bubblewrap
.
run
([
_get_exiftool_path
(),
'-json'
,
...
...
libmat2/harmless.py
View file @
cc5be860
import
shutil
from
typing
import
Dict
,
Union
from
typing
import
Union
from
.
import
abstract
...
...
@@ -7,7 +7,7 @@ class HarmlessParser(abstract.AbstractParser):
""" This is the parser for filetypes that can not contain metadata. """
mimetypes
=
{
'text/plain'
,
'image/x-ms-bmp'
}
def
get_meta
(
self
)
->
D
ict
[
str
,
Union
[
str
,
dict
]]:
def
get_meta
(
self
)
->
d
ict
[
str
,
Union
[
str
,
dict
]]:
return
dict
()
def
remove_all
(
self
)
->
bool
:
...
...
libmat2/images.py
View file @
cc5be860
import
imghdr
import
os
import
re
from
typing
import
Set
,
Dict
,
Union
,
Any
from
typing
import
Union
,
Any
import
cairo
...
...
@@ -13,7 +13,6 @@ from gi.repository import GdkPixbuf, GLib, Rsvg
from
.
import
exiftool
,
abstract
# Make pyflakes happy
assert
Set
assert
Any
class
SVGParser
(
exiftool
.
ExiftoolParser
):
...
...
@@ -50,7 +49,7 @@ class SVGParser(exiftool.ExiftoolParser):
surface
.
finish
()
return
True
def
get_meta
(
self
)
->
D
ict
[
str
,
Union
[
str
,
dict
]]:
def
get_meta
(
self
)
->
d
ict
[
str
,
Union
[
str
,
dict
]]:
meta
=
super
().
get_meta
()
# The namespace is mandatory, but only the …/2000/svg is valid.
...
...
@@ -165,8 +164,8 @@ class TiffParser(GdkPixbufAbstractParser):
class
PPMParser
(
abstract
.
AbstractParser
):
mimetypes
=
{
'image/x-portable-pixmap'
}
def
get_meta
(
self
)
->
D
ict
[
str
,
Union
[
str
,
dict
]]:
meta
=
{}
# type:
D
ict[str, Union[str,
D
ict[Any, Any]]]
def
get_meta
(
self
)
->
d
ict
[
str
,
Union
[
str
,
dict
]]:
meta
=
{}
# type:
d
ict[str, Union[str,
d
ict[Any, Any]]]
with
open
(
self
.
filename
)
as
f
:
for
idx
,
line
in
enumerate
(
f
):
if
line
.
lstrip
().
startswith
(
'#'
):
...
...
libmat2/office.py
View file @
cc5be860
...
...
@@ -4,7 +4,7 @@ import logging
import
os
import
re
import
zipfile
from
typing
import
Dict
,
Set
,
Pattern
,
Tuple
,
Any
from
typing
import
Pattern
,
Any
import
xml.etree.ElementTree
as
ET
# type: ignore
...
...
@@ -13,10 +13,9 @@ from .archive import ZipParser
# pylint: disable=line-too-long
# Make pyflakes happy
assert
Set
assert
Pattern
def
_parse_xml
(
full_path
:
str
)
->
T
uple
[
ET
.
ElementTree
,
D
ict
[
str
,
str
]]:
def
_parse_xml
(
full_path
:
str
)
->
t
uple
[
ET
.
ElementTree
,
d
ict
[
str
,
str
]]:
""" This function parses XML, with namespace support. """
namespace_map
=
dict
()
for
_
,
(
key
,
value
)
in
ET
.
iterparse
(
full_path
,
(
"start-ns"
,
)):
...
...
@@ -148,7 +147,7 @@ class MSOfficeParser(ZipParser):
return
False
xml_data
=
zin
.
read
(
'[Content_Types].xml'
)
self
.
content_types
=
dict
()
# type:
D
ict[str, str]
self
.
content_types
=
dict
()
# type:
d
ict[str, str]
try
:
tree
=
ET
.
fromstring
(
xml_data
)
except
ET
.
ParseError
:
...
...
@@ -431,7 +430,7 @@ class MSOfficeParser(ZipParser):
return
True
def
_specific_get_meta
(
self
,
full_path
:
str
,
file_path
:
str
)
->
D
ict
[
str
,
Any
]:
def
_specific_get_meta
(
self
,
full_path
:
str
,
file_path
:
str
)
->
d
ict
[
str
,
Any
]:
"""
Yes, I know that parsing xml with regexp ain't pretty,
be my guest and fix it if you want.
...
...
@@ -512,7 +511,7 @@ class LibreOfficeParser(ZipParser):
return
False
return
True
def
_specific_get_meta
(
self
,
full_path
:
str
,
file_path
:
str
)
->
D
ict
[
str
,
Any
]:
def
_specific_get_meta
(
self
,
full_path
:
str
,
file_path
:
str
)
->
d
ict
[
str
,
Any
]:
"""
Yes, I know that parsing xml with regexp ain't pretty,
be my guest and fix it if you want.
...
...
libmat2/parser_factory.py
View file @
cc5be860
...
...
@@ -2,7 +2,7 @@ import glob
import
os
import
mimetypes
import
importlib
from
typing
import
TypeVar
,
List
,
Tuple
,
Optional
from
typing
import
TypeVar
,
Optional
from
.
import
abstract
,
UNSUPPORTED_EXTENSIONS
...
...
@@ -34,7 +34,7 @@ def __load_all_parsers():
__load_all_parsers
()
def
_get_parsers
()
->
L
ist
[
T
]:
def
_get_parsers
()
->
l
ist
[
T
]:
""" Get all our parsers!"""
def
__get_parsers
(
cls
):
return
cls
.
__subclasses__
()
+
\
...
...
@@ -42,7 +42,7 @@ def _get_parsers() -> List[T]:
return
__get_parsers
(
abstract
.
AbstractParser
)
def
get_parser
(
filename
:
str
)
->
T
uple
[
Optional
[
T
],
Optional
[
str
]]:
def
get_parser
(
filename
:
str
)
->
t
uple
[
Optional
[
T
],
Optional
[
str
]]:
""" Return the appropriate parser for a given filename.
:raises ValueError: Raised if the instantiation of the parser went wrong.
...
...
libmat2/pdf.py
View file @
cc5be860
...
...
@@ -7,7 +7,7 @@ import re
import
logging
import
tempfile
import
io
from
typing
import
Dict
,
Union
from
typing
import
Union
from
distutils.version
import
LooseVersion
import
cairo
...
...
@@ -146,13 +146,13 @@ class PDFParser(abstract.AbstractParser):
return
True
@
staticmethod
def
__parse_metadata_field
(
data
:
str
)
->
D
ict
[
str
,
str
]:
def
__parse_metadata_field
(
data
:
str
)
->
d
ict
[
str
,
str
]:
metadata
=
{}
for
(
_
,
key
,
value
)
in
re
.
findall
(
r
"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>"
,
data
,
re
.
I
):
metadata
[
key
]
=
value
return
metadata
def
get_meta
(
self
)
->
D
ict
[
str
,
Union
[
str
,
dict
]]:
def
get_meta
(
self
)
->
d
ict
[
str
,
Union
[
str
,
dict
]]:
""" Return a dict with all the meta of the file
"""
metadata
=
{}
...
...
libmat2/torrent.py
View file @
cc5be860
import
logging
from
typing
import
Union
,
Tuple
,
Dict
from
typing
import
Union
from
.
import
abstract
...
...
@@ -15,7 +15,7 @@ class TorrentParser(abstract.AbstractParser):
if
self
.
dict_repr
is
None
:
raise
ValueError
def
get_meta
(
self
)
->
D
ict
[
str
,
Union
[
str
,
dict
]]:
def
get_meta
(
self
)
->
d
ict
[
str
,
Union
[
str
,
dict
]]:
metadata
=
{}
for
key
,
value
in
self
.
dict_repr
.
items
():
if
key
not
in
self
.
allowlist
:
...
...
@@ -56,7 +56,7 @@ class _BencodeHandler:
}
@
staticmethod
def
__decode_int
(
s
:
bytes
)
->
T
uple
[
int
,
bytes
]:
def
__decode_int
(
s
:
bytes
)
->
t
uple
[
int
,
bytes
]:
s
=
s
[
1
:]
next_idx
=
s
.
index
(
b
'e'
)
if
s
.
startswith
(
b
'-0'
):
...
...
@@ -66,7 +66,7 @@ class _BencodeHandler:
return
int
(
s
[:
next_idx
]),
s
[
next_idx
+
1
:]
@
staticmethod
def
__decode_string
(
s
:
bytes
)
->
T
uple
[
bytes
,
bytes
]:
def
__decode_string
(
s
:
bytes
)
->
t
uple
[
bytes
,
bytes
]:
colon
=
s
.
index
(
b
':'
)
# FIXME Python3 is broken here, the call to `ord` shouldn't be needed,
# but apparently it is. This is utterly idiotic.
...
...
@@ -76,7 +76,7 @@ class _BencodeHandler:
s
=
s
[
1
:]
return
s
[
colon
:
colon
+
str_len
],
s
[
colon
+
str_len
:]
def
__decode_list
(
self
,
s
:
bytes
)
->
T
uple
[
list
,
bytes
]:
def
__decode_list
(
self
,
s
:
bytes
)
->
t
uple
[
list
,
bytes
]:
ret
=
list
()
s
=
s
[
1
:]
# skip leading `l`
while
s
[
0
]
!=
ord
(
'e'
):
...
...
@@ -84,7 +84,7 @@ class _BencodeHandler:
ret
.
append
(
value
)
return
ret
,
s
[
1
:]
def
__decode_dict
(
self
,
s
:
bytes
)
->
T
uple
[
dict
,
bytes
]:
def
__decode_dict
(
self
,
s
:
bytes
)
->
t
uple
[
dict
,
bytes
]:
ret
=
dict
()
s
=
s
[
1
:]
# skip leading `d`
while
s
[
0
]
!=
ord
(
b
'e'
):
...
...
libmat2/video.py
View file @
cc5be860
...
...
@@ -3,7 +3,7 @@ import functools
import
shutil
import
logging
from
typing
import
Dict
,
Union
from
typing
import
Union
from
.
import
exiftool
from
.
import
bubblewrap
...
...
@@ -12,7 +12,7 @@ from . import bubblewrap
class
AbstractFFmpegParser
(
exiftool
.
ExiftoolParser
):
""" Abstract parser for all FFmpeg-based ones, mainly for video. """
# Some fileformats have mandatory metadata fields
meta_key_value_allowlist
=
{}
# type:
D
ict[str, Union[str, int]]
meta_key_value_allowlist
=
{}
# type:
d
ict[str, Union[str, int]]
def
remove_all
(
self
)
->
bool
:
if
self
.
meta_key_value_allowlist
:
...
...
@@ -45,10 +45,10 @@ class AbstractFFmpegParser(exiftool.ExiftoolParser):
return
False
return
True
def
get_meta
(
self
)
->
D
ict
[
str
,
Union
[
str
,
dict
]]:
def
get_meta
(
self
)
->
d
ict
[
str
,
Union
[
str
,
dict
]]:
meta
=
super
().
get_meta
()
ret
=
dict
()
# type:
D
ict[str, Union[str, dict]]
ret
=
dict
()
# type:
d
ict[str, Union[str, dict]]
for
key
,
value
in
meta
.
items
():
if
key
in
self
.
meta_key_value_allowlist
:
if
value
==
self
.
meta_key_value_allowlist
[
key
]:
...
...
libmat2/web.py
View file @
cc5be860
from
html
import
parser
,
escape
from
typing
import
Dict
,
Any
,
List
,
Tuple
,
Set
,
Optional
from
typing
import
Any
,
Optional
import
re
import
string
from
.
import
abstract
assert
Set
# pylint: disable=too-many-instance-attributes
...
...
@@ -26,7 +25,7 @@ class CSSParser(abstract.AbstractParser):
f
.
write
(
cleaned
)
return
True
def
get_meta
(
self
)
->
D
ict
[
str
,
Any
]:
def
get_meta
(
self
)
->
d
ict
[
str
,
Any
]:
metadata
=
{}
with
open
(
self
.
filename
,
encoding
=
'utf-8'
)
as
f
:
try
:
...
...
@@ -45,10 +44,10 @@ class CSSParser(abstract.AbstractParser):
class
AbstractHTMLParser
(
abstract
.
AbstractParser
):
tags_blocklist
=
set
()
# type:
S
et[str]
tags_blocklist
=
set
()
# type:
s
et[str]
# In some html/xml-based formats some tags are mandatory,
# so we're keeping them, but are discarding their content
tags_required_blocklist
=
set
()
# type:
S
et[str]
tags_required_blocklist
=
set
()
# type:
s
et[str]
def
__init__
(
self
,
filename
):
super
().
__init__
(
filename
)
...
...
@@ -58,7 +57,7 @@ class AbstractHTMLParser(abstract.AbstractParser):
self
.
__parser
.
feed
(
f
.
read
())
self
.
__parser
.
close
()
def
get_meta
(
self
)
->
D
ict
[
str
,
Any
]:
def
get_meta
(
self
)
->
d
ict
[
str
,
Any
]:
return
self
.
__parser
.
get_meta
()
def
remove_all
(
self
)
->
bool
:
...
...
@@ -92,7 +91,7 @@ class _HTMLParser(parser.HTMLParser):
self
.
filename
=
filename
self
.
__textrepr
=
''
self
.
__meta
=
{}
self
.
__validation_queue
=
[]
# type:
L
ist[str]
self
.
__validation_queue
=
[]
# type:
l
ist[str]
# We're using counters instead of booleans, to handle nested tags
self
.
__in_dangerous_but_required_tag
=
0
...
...
@@ -114,7 +113,7 @@ class _HTMLParser(parser.HTMLParser):
"""
raise
ValueError
(
message
)
def
handle_starttag
(
self
,
tag
:
str
,
attrs
:
L
ist
[
T
uple
[
str
,
Optional
[
str
]]]):
def
handle_starttag
(
self
,
tag
:
str
,
attrs
:
l
ist
[
t
uple
[
str
,
Optional
[
str
]]]):
# Ignore the type, because mypy is too stupid to infer
# that get_starttag_text() can't return None.
original_tag
=
self
.
get_starttag_text
()
# type: ignore
...
...
@@ -161,7 +160,7 @@ class _HTMLParser(parser.HTMLParser):
self
.
__textrepr
+=
escape
(
data
)
def
handle_startendtag
(
self
,
tag
:
str
,
attrs
:
L
ist
[
T
uple
[
str
,
Optional
[
str
]]]):
attrs
:
l
ist
[
t
uple
[
str
,
Optional
[
str
]]]):
if
tag
in
self
.
tag_required_blocklist
|
self
.
tag_blocklist
:
meta
=
{
k
:
v
for
k
,
v
in
attrs
}
name
=
meta
.
get
(
'name'
,
'harmful metadata'
)
...
...
@@ -186,7 +185,7 @@ class _HTMLParser(parser.HTMLParser):
f
.
write
(
self
.
__textrepr
)
return
True
def
get_meta
(
self
)
->
D
ict
[
str
,
Any
]:
def
get_meta
(
self
)
->
d
ict
[
str
,
Any
]:
if
self
.
__validation_queue
:
raise
ValueError
(
"Some tags (%s) were left unclosed in %s"
%
(
', '
.
join
(
self
.
__validation_queue
),
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment