Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
mat2
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
12
Issues
12
List
Boards
Labels
Service Desk
Milestones
Merge Requests
1
Merge Requests
1
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
jvoisin
mat2
Commits
55214206
Commit
55214206
authored
Feb 27, 2019
by
jvoisin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Improve the previous commit
- More tests - More documentation - Minor code cleanup
parent
73d2966e
Pipeline
#23033
failed with stages
in 4 minutes and 32 seconds
Changes
3
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
51 additions
and
18 deletions
+51
-18
libmat2/web.py
libmat2/web.py
+23
-15
tests/test_corrupted_files.py
tests/test_corrupted_files.py
+1
-3
tests/test_libmat2.py
tests/test_libmat2.py
+27
-0
No files found.
libmat2/web.py
View file @
55214206
...
...
@@ -38,8 +38,8 @@ class CSSParser(abstract.AbstractParser):
class
AbstractHTMLParser
(
abstract
.
AbstractParser
):
tags_blacklist
=
set
()
# type: Set[str]
# In some html/xml
based formats some tags are mandatory,
# so we're keeping them, but are discaring their content
s
# In some html/xml
-
based formats some tags are mandatory,
# so we're keeping them, but are discaring their content
tags_required_blacklist
=
set
()
# type: Set[str]
def
__init__
(
self
,
filename
):
...
...
@@ -72,6 +72,12 @@ class _HTMLParser(parser.HTMLParser):
"""Python doesn't have a validating html parser in its stdlib, so
we're using an internal queue to track all the opening/closing tags,
and hoping for the best.
Moreover, the parser.HTMLParser call doesn't provide a get_endtag_text
method, so we have to use get_starttag_text instead, put its result in a
LIFO, and transform it in a closing tag when needed.
Also, gotcha: the `tag` parameters are always in lowercase.
"""
def
__init__
(
self
,
filename
,
blacklisted_tags
,
required_blacklisted_tags
):
super
().
__init__
()
...
...
@@ -79,6 +85,7 @@ class _HTMLParser(parser.HTMLParser):
self
.
__textrepr
=
''
self
.
__meta
=
{}
self
.
__validation_queue
=
[]
# type: List[str]
# We're using counters instead of booleans, to handle nested tags
self
.
__in_dangerous_but_required_tag
=
0
self
.
__in_dangerous_tag
=
0
...
...
@@ -93,15 +100,16 @@ class _HTMLParser(parser.HTMLParser):
original_tag
=
self
.
get_starttag_text
()
self
.
__validation_queue
.
append
(
original_tag
)
if
tag
in
self
.
tag_required_blacklist
:
self
.
__in_dangerous_but_required_tag
+=
1
if
tag
in
self
.
tag_blacklist
:
self
.
__in_dangerous_tag
+=
1
if
self
.
__in_dangerous_tag
==
0
:
if
self
.
__in_dangerous_but_required_tag
<=
1
:
if
self
.
__in_dangerous_but_required_tag
==
0
:
self
.
__textrepr
+=
original_tag
if
tag
in
self
.
tag_required_blacklist
:
self
.
__in_dangerous_but_required_tag
+=
1
def
handle_endtag
(
self
,
tag
:
str
):
if
not
self
.
__validation_queue
:
raise
ValueError
(
"The closing tag %s doesn't have a corresponding "
...
...
@@ -115,14 +123,15 @@ class _HTMLParser(parser.HTMLParser):
"tag %s in %s"
%
(
tag
,
previous_tag
,
self
.
filename
))
if
tag
in
self
.
tag_required_blacklist
:
self
.
__in_dangerous_but_required_tag
-=
1
if
self
.
__in_dangerous_tag
==
0
:
if
self
.
__in_dangerous_but_required_tag
<=
1
:
if
self
.
__in_dangerous_but_required_tag
==
0
:
# There is no `get_endtag_text()` method :/
self
.
__textrepr
+=
'</'
+
previous_tag
+
'>'
if
tag
in
self
.
tag_required_blacklist
:
self
.
__in_dangerous_but_required_tag
-=
1
elif
tag
in
self
.
tag_blacklist
:
if
tag
in
self
.
tag_blacklist
:
self
.
__in_dangerous_tag
-=
1
def
handle_data
(
self
,
data
:
str
):
...
...
@@ -138,14 +147,13 @@ class _HTMLParser(parser.HTMLParser):
content
=
meta
.
get
(
'content'
,
'harmful data'
)
self
.
__meta
[
name
]
=
content
if
self
.
__in_dangerous_tag
!=
0
:
if
self
.
__in_dangerous_tag
==
0
:
if
tag
in
self
.
tag_required_blacklist
:
self
.
__textrepr
+=
'<'
+
tag
+
' />'
return
elif
tag
in
self
.
tag_required_blacklist
:
self
.
__textrepr
+=
'<'
+
tag
+
' />'
return
if
self
.
__in_dangerous_
but_required_
tag
==
0
:
if
self
.
__in_dangerous_tag
==
0
:
if
self
.
__in_dangerous_tag
==
0
:
if
self
.
__in_dangerous_
but_required_
tag
==
0
:
self
.
__textrepr
+=
self
.
get_starttag_text
()
def
remove_all
(
self
,
output_filename
:
str
)
->
bool
:
...
...
tests/test_corrupted_files.py
View file @
55214206
...
...
@@ -269,9 +269,6 @@ class TestCorruptedFiles(unittest.TestCase):
os
.
remove
(
'./tests/data/clean.html'
)
with
open
(
'./tests/data/clean.html'
,
'w'
)
as
f
:
f
.
write
(
'<meta><meta/></meta>'
)
f
.
write
(
'<title><title>pouet</title></title>'
)
f
.
write
(
'<title><mysupertag/></title>'
)
f
.
write
(
'<doctitle><br/></doctitle><br/><notclosed>'
)
p
=
web
.
HTMLParser
(
'./tests/data/clean.html'
)
with
self
.
assertRaises
(
ValueError
):
...
...
@@ -281,6 +278,7 @@ class TestCorruptedFiles(unittest.TestCase):
p
.
remove_all
()
os
.
remove
(
'./tests/data/clean.html'
)
def
test_epub
(
self
):
with
zipfile
.
ZipFile
(
'./tests/data/clean.epub'
,
'w'
)
as
zout
:
zout
.
write
(
'./tests/data/dirty.jpg'
,
'OEBPS/content.opf'
)
...
...
tests/test_libmat2.py
View file @
55214206
...
...
@@ -633,6 +633,33 @@ class TestCleaning(unittest.TestCase):
os
.
remove
(
'./tests/data/clean.cleaned.html'
)
os
.
remove
(
'./tests/data/clean.cleaned.cleaned.html'
)
with
open
(
'./tests/data/clean.html'
,
'w'
)
as
f
:
f
.
write
(
'<title><title><pouet/><meta/></title></title><test/>'
)
p
=
web
.
HTMLParser
(
'./tests/data/clean.html'
)
self
.
assertTrue
(
p
.
remove_all
())
with
open
(
'./tests/data/clean.cleaned.html'
,
'r'
)
as
f
:
self
.
assertEqual
(
f
.
read
(),
'<title></title><test/>'
)
os
.
remove
(
'./tests/data/clean.html'
)
os
.
remove
(
'./tests/data/clean.cleaned.html'
)
with
open
(
'./tests/data/clean.html'
,
'w'
)
as
f
:
f
.
write
(
'<test><title>Some<b>metadata</b><br/></title></test>'
)
p
=
web
.
HTMLParser
(
'./tests/data/clean.html'
)
self
.
assertTrue
(
p
.
remove_all
())
with
open
(
'./tests/data/clean.cleaned.html'
,
'r'
)
as
f
:
self
.
assertEqual
(
f
.
read
(),
'<test><title></title></test>'
)
os
.
remove
(
'./tests/data/clean.html'
)
os
.
remove
(
'./tests/data/clean.cleaned.html'
)
with
open
(
'./tests/data/clean.html'
,
'w'
)
as
f
:
f
.
write
(
'<meta><meta/></meta>'
)
p
=
web
.
HTMLParser
(
'./tests/data/clean.html'
)
self
.
assertTrue
(
p
.
remove_all
())
with
open
(
'./tests/data/clean.cleaned.html'
,
'r'
)
as
f
:
self
.
assertEqual
(
f
.
read
(),
''
)
os
.
remove
(
'./tests/data/clean.html'
)
os
.
remove
(
'./tests/data/clean.cleaned.html'
)
def
test_epub
(
self
):
shutil
.
copy
(
'./tests/data/dirty.epub'
,
'./tests/data/clean.epub'
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment