Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
mat2
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Brolf
mat2
Compare revisions
master to handle_html
Compare revisions
Changes are shown as if the
source
revision was being merged into the
target
revision.
Learn more about comparing revisions.
Source
Brolf/mat2
Select target project
No results found
handle_html
Select Git revision
Branches
#96
add_epub
handle_html
implement_lightweight_mode_msoffice
master
office
refactor_office
Tags
0.1.0
0.1.1
0.1.2
0.1.3
0.2.0
0.3.0
0.3.1
0.4.0
0.5.0
0.6.0
0.7.0
18 results
Swap
Target
jvoisin/mat2
Select target project
tguinot/mat2
jvoisin/mat2
dachary/mat2
mejo-/mat2
LogicalDash/mat2
dkg/mat2
christian/mat2
Selflike323/mat2
fz/mat2
iwwmidatlanticgdc/mat2
Gu1nn3zz/mat2
smagnin/mat2
flashcode/mat2
MANCASTILLEJA/mat2
jboursier/mat2
tails/mat2
matiargs/mat2
Brolf/mat2
madaidan/mat2
Delmer84/mat2
yuebyzua/mat2
yyyyyyyan/mat2
rmnvgr/mat2
Marxism-Leninism/mat2
GNUtoo/mat2
allexj/mat2
b068931cc450442b63f5b3d276ea4297/mat2
chenrui/mat2
nosec13346/mat2
anelki/mat2
30 results
master
Select Git revision
Branches
fix_heic
master
Tags
0.1.0
0.1.1
0.1.2
0.1.3
0.10.0
0.10.1
0.11.0
0.12.0
0.12.1
0.12.2
0.12.3
0.12.4
0.13.0
0.13.1
0.13.2
0.13.3
0.13.4
0.13.5
0.2.0
0.3.0
0.3.1
0.4.0
0.5.0
0.6.0
0.7.0
0.8.0
0.9.0
29 results
Show changes
Only incoming changes from source
Include changes to target since source was created
Compare
Commits on Source (2)
Add support for html files
· 06283b00
Julien (jvoisin) Voisin
authored
6 years ago
06283b00
Bump coverage
· 914b2de6
Julien (jvoisin) Voisin
authored
6 years ago
914b2de6
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
libmat2/html.py
+69
-0
69 additions, 0 deletions
libmat2/html.py
tests/data/dirty.html
+14
-0
14 additions, 0 deletions
tests/data/dirty.html
tests/test_corrupted_files.py
+38
-1
38 additions, 1 deletion
tests/test_corrupted_files.py
tests/test_libmat2.py
+19
-1
19 additions, 1 deletion
tests/test_libmat2.py
with
140 additions
and
2 deletions
libmat2/html.py
0 → 100644
View file @
914b2de6
from
html
import
parser
from
typing
import
Dict
,
Any
,
List
,
Tuple
from
.
import
abstract
class HTMLParser(abstract.AbstractParser):
    """Parser for `text/html` files.

    The whole file is read and parsed when the object is created; the
    actual work is delegated to an internal `_HTMLParser` instance.
    """
    mimetypes = {'text/html'}

    def __init__(self, filename):
        """Read `filename` and feed its entire content to the HTML parser.

        Any exception raised while reading or parsing the file (for
        instance the `ValueError` that `_HTMLParser` raises on mismatched
        tags) propagates to the caller.
        """
        super().__init__(filename)
        self.__parser = _HTMLParser()
        with open(filename) as source:
            markup = source.read()
            self.__parser.feed(markup)
        # Tell the parser that no more data is coming, so that it
        # processes whatever it still has buffered.
        self.__parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata collected while parsing the file."""
        collected = self.__parser.get_meta()
        return collected

    def remove_all(self) -> bool:
        """Write the cleaned document and return the parser's verdict.

        The destination is `self.output_filename`, which is not set in
        this class (presumably by `abstract.AbstractParser` - to confirm).
        """
        destination = self.output_filename
        return self.__parser.remove_all(destination)
class _HTMLParser(parser.HTMLParser):
    """Re-serialise an HTML document while stripping its `<meta>` tags.

    Python doesn't have a validating html parser in its stdlib, so we're
    using an internal stack to track all the opening/closing tags, and
    hoping for the best: a closing tag that doesn't match the most
    recently opened tag raises `ValueError` while the document is being
    fed, and tags that are still open make `get_meta` and `remove_all`
    raise `ValueError` as well.

    The cleaned document is rebuilt on the fly from the parsing events:
    start tags are copied verbatim, end tags are regenerated (followed by
    a newline), whitespace-only text is dropped, and `<meta>` tags are
    recorded as metadata instead of being copied. Comments, declarations
    and processing instructions have no handler here, so they don't end
    up in the output.
    """

    # Elements that never have an end tag, per the HTML specification.
    # They must not be pushed on the validation stack, otherwise perfectly
    # valid markup like `<br>` would be reported as unclosed.
    _VOID_ELEMENTS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    })

    def __init__(self):
        # Character references are kept undecoded: decoding `&lt;` into
        # `<` and writing it back as-is would turn text into markup.
        super().__init__(convert_charrefs=False)
        # NOTE: the stdlib base class is also named `HTMLParser`, so
        # double-underscore attributes share its name-mangling prefix.
        # CPython's implementation already uses `__starttag_text`; don't
        # reuse that name here.
        self.__chunks = []  # pieces of the cleaned document, in order
        self.__meta = {}
        self.__validation_queue = []  # stack of currently open tags
        # True right after a character/entity reference was emitted, so
        # that the whitespace separating two references isn't dropped.
        self.__after_reference = False

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Copy the start tag to the output and remember it as open.

        Void elements (`<br>`, `<img>`, `<meta>`, ...) are handled exactly
        like their self-closing form (`<br/>`).
        """
        if tag in self._VOID_ELEMENTS:
            self.handle_startendtag(tag, attrs)
            return
        self.__after_reference = False
        self.__chunks.append(self.get_starttag_text())
        self.__validation_queue.append(tag)

    def handle_endtag(self, tag: str):
        """Check that `tag` closes the most recently opened tag.

        Raises:
            ValueError: if no tag is open, or if the most recently opened
                tag isn't `tag`.
        """
        if tag in self._VOID_ELEMENTS:
            # Void elements are never pushed on the stack, so a stray
            # `</img>` (as in `<img></img>`) is simply ignored.
            return
        if not self.__validation_queue:
            raise ValueError('closing tag </' + tag + '> without opening tag')
        opened = self.__validation_queue.pop()
        if tag != opened:
            raise ValueError('closing tag </' + tag + '> does not match '
                             'opening tag <' + opened + '>')
        self.__after_reference = False
        # There is no `get_endtag_text()` method :/
        self.__chunks.append('</' + tag + '>\n')

    def handle_data(self, data: str):
        """Copy text to the output, dropping whitespace-only chunks.

        Since references are reported separately from the text around
        them, whitespace that directly follows a reference is kept: it
        might be the only separator between two references.
        """
        if data.strip() or self.__after_reference:
            self.__chunks.append(data)
        self.__after_reference = False

    def handle_entityref(self, name: str):
        """Copy a named reference (like `&amp;`) to the output, undecoded."""
        self.__chunks.append('&' + name + ';')
        self.__after_reference = True

    def handle_charref(self, name: str):
        """Copy a numeric reference (like `&#38;`) to the output, undecoded."""
        self.__chunks.append('&#' + name + ';')
        self.__after_reference = True

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Record `<meta>` tags as metadata, copy any other tag verbatim.

        A `<meta>` tag is stored under its `name` attribute with its
        `content` attribute as value (with placeholders when one of them
        is missing), and is left out of the output.
        """
        self.__after_reference = False
        if tag == 'meta':
            meta = dict(attrs)
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content
        else:
            self.__chunks.append(self.get_starttag_text())

    def remove_all(self, output_filename: str) -> bool:
        """Write the cleaned document to `output_filename`.

        Returns:
            True once the file has been written.

        Raises:
            ValueError: if some tags were never closed.
        """
        if self.__validation_queue:
            raise ValueError('unclosed tag(s): ' +
                             ', '.join(self.__validation_queue))
        with open(output_filename, 'w') as f:
            f.write(''.join(self.__chunks))
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata found in the `<meta>` tags.

        Raises:
            ValueError: if some tags were never closed.
        """
        if self.__validation_queue:
            raise ValueError('unclosed tag(s): ' +
                             ', '.join(self.__validation_queue))
        return self.__meta
This diff is collapsed.
Click to expand it.
tests/data/dirty.html
0 → 100644
View file @
914b2de6
<html>
<head>
<meta
content=
"vim"
name=
"generator"
/>
<meta
content=
"jvoisin"
name=
"author"
/>
</head>
<body>
<p>
<h1>
Hello
</h1>
I am a web page.
Please
<b>
love
</b>
me.
Here, have a pretty picture:
<img
src=
'dirty.jpg'
alt=
'a pretty picture'
/>
</p>
</body>
</html>
This diff is collapsed.
Click to expand it.
tests/test_corrupted_files.py
View file @
914b2de6
...
...
@@ -7,7 +7,7 @@ import logging
import
zipfile
from
libmat2
import
pdf
,
images
,
audio
,
office
,
parser_factory
,
torrent
from
libmat2
import
harmless
,
video
from
libmat2
import
harmless
,
video
,
html
# No need to log messages; should something go wrong,
# the testsuite _will_ fail.
...
...
@@ -232,3 +232,40 @@ class TestCorruptedFiles(unittest.TestCase):
self
.
assertEqual
(
meta
[
'
tests/data/dirty.docx
'
][
'
word/media/image1.png
'
][
'
Comment
'
],
'
This is a comment, be careful!
'
)
self
.
assertFalse
(
p
.
remove_all
())
os
.
remove
(
'
./tests/data/dirty.zip
'
)
def test_html(self):
    """Check how the html parser copes with malformed documents."""
    pristine = './tests/data/dirty.html'
    target = './tests/data/clean.html'
    cleaned = './tests/data/clean.cleaned.html'

    # A closing tag that doesn't match the opening one is rejected as
    # soon as the file is parsed.
    shutil.copy(pristine, target)
    with open(target, 'a') as handle:
        handle.write('<open>but not</closed>')
    with self.assertRaises(ValueError):
        html.HTMLParser(target)
    os.remove(target)

    # Yes, we're able to deal with malformed html :/
    shutil.copy(pristine, target)
    with open(target, 'a') as handle:
        handle.write('<meta name=\'this" is="weird"/>')
    dirty_parser = html.HTMLParser(target)
    self.assertTrue(dirty_parser.remove_all())
    cleaned_parser = html.HTMLParser(cleaned)
    self.assertEqual(cleaned_parser.get_meta(), {})
    os.remove(target)
    os.remove(cleaned)

    # A closing tag without any opening tag is rejected at parsing time.
    with open(target, 'w') as handle:
        handle.write('</close>')
    with self.assertRaises(ValueError):
        html.HTMLParser(target)
    os.remove(target)

    # A tag that is never closed parses fine, but is reported by both
    # `get_meta` and `remove_all`.
    with open(target, 'w') as handle:
        handle.write('<notclosed>')
    unclosed = html.HTMLParser(target)
    with self.assertRaises(ValueError):
        unclosed.get_meta()
    unclosed = html.HTMLParser(target)
    with self.assertRaises(ValueError):
        unclosed.remove_all()
    os.remove(target)
This diff is collapsed.
Click to expand it.
tests/test_libmat2.py
View file @
914b2de6
...
...
@@ -6,7 +6,7 @@ import os
import
zipfile
from
libmat2
import
pdf
,
images
,
audio
,
office
,
parser_factory
,
torrent
,
harmless
from
libmat2
import
check_dependencies
,
video
,
archive
from
libmat2
import
check_dependencies
,
video
,
archive
,
html
class
TestCheckDependencies
(
unittest
.
TestCase
):
...
...
@@ -596,3 +596,21 @@ class TestCleaning(unittest.TestCase):
os
.
remove
(
'
./tests/data/clean.gif
'
)
os
.
remove
(
'
./tests/data/clean.cleaned.gif
'
)
os
.
remove
(
'
./tests/data/clean.cleaned.cleaned.gif
'
)
def test_html(self):
    """Clean an html file, then make sure the result is really clean."""
    original = './tests/data/clean.html'
    once_cleaned = './tests/data/clean.cleaned.html'
    twice_cleaned = './tests/data/clean.cleaned.cleaned.html'

    shutil.copy('./tests/data/dirty.html', original)
    dirty = html.HTMLParser(original)

    # The fixture advertises its author in a <meta> tag.
    self.assertEqual(dirty.get_meta()['author'], 'jvoisin')
    self.assertTrue(dirty.remove_all())

    # The cleaned file must be free of metadata, and cleanable again.
    clean = html.HTMLParser(once_cleaned)
    self.assertEqual(clean.get_meta(), {})
    self.assertTrue(clean.remove_all())

    for leftover in (original, once_cleaned, twice_cleaned):
        os.remove(leftover)
This diff is collapsed.
Click to expand it.