Skip to content
Snippets Groups Projects
Commit 6cc034e8 authored by Julien (jvoisin) Voisin's avatar Julien (jvoisin) Voisin
Browse files

Add support for html files

parent e1dd439f
No related branches found
No related tags found
No related merge requests found
from html import parser
from typing import Dict, Any, List, Tuple
from . import abstract
class HTMLParser(abstract.AbstractParser):
    """Parser for `text/html` files.

    The whole file is read and pushed through an internal `_HTMLParser`
    when the object is created; `get_meta` and `remove_all` simply
    forward to that internal parser.
    """
    mimetypes = {'text/html'}

    def __init__(self, filename):
        """Parse `filename` right away.

        Any exception raised by the internal parser while feeding it the
        file content propagates to the caller.
        """
        super().__init__(filename)
        html_parser = _HTMLParser()
        self.__parser = html_parser
        with open(filename) as handle:
            content = handle.read()
            html_parser.feed(content)
        html_parser.close()

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata collected by the internal parser."""
        collected = self.__parser.get_meta()
        return collected

    def remove_all(self) -> bool:
        """Write the cleaned document to `self.output_filename`.

        `output_filename` is not set in this class; presumably it comes
        from `abstract.AbstractParser` -- verify there.
        """
        destination = self.output_filename
        return self.__parser.remove_all(destination)
class _HTMLParser(parser.HTMLParser):
    """Rebuild an html document while setting its `<meta/>` tags aside.

    Python doesn't have a validating html parser in its stdlib, so
    we're using an internal queue to track all the opening/closing tags,
    and hoping for the best.

    Every start tag is pushed on the queue, and every end tag must match
    the most recently pushed one. A `ValueError` is raised as soon as an
    end tag doesn't match, and by `get_meta`/`remove_all` when some tags
    were never closed.

    NOTE(review): only self-closing `<meta .../>` tags are collected and
    stripped. A `<meta ...>` written without the trailing slash goes
    through `handle_starttag`, so it is kept in the output and stays on
    the queue until a matching `</meta>` shows up.

    NOTE(review): the base parser is built with its default
    `convert_charrefs=True`, so character references in text (`&lt;`,
    `&amp;`, ...) reach `handle_data` already decoded and are written
    back as-is. Confirm whether the output should re-escape them.

    NOTE(review): the stdlib base class is itself named `HTMLParser`, so
    its double-underscore attributes are mangled with the same
    `_HTMLParser__` prefix as ours; keep private names distinct from the
    ones it uses (e.g. `__starttag_text`).
    """

    def __init__(self):
        super().__init__()
        # Pieces of the cleaned document, in order; joined only on write.
        self.__fragments = []
        # Metadata harvested from the self-closing `<meta/>` tags.
        self.__meta = {}
        # Names of the tags opened and not closed yet, used as a stack.
        self.__validation_queue = []

    def __ensure_all_closed(self):
        """Raise `ValueError` if some opened tags were never closed."""
        if self.__validation_queue:
            raise ValueError('Unclosed tag(s): %s'
                             % ', '.join(self.__validation_queue))

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Copy the start tag verbatim and remember that it is open."""
        self.__fragments.append(self.get_starttag_text())
        self.__validation_queue.append(tag)

    def handle_endtag(self, tag: str):
        """Copy the end tag after checking it closes the last opened tag.

        Raises:
            ValueError: if no tag is open, or if `tag` is not the most
                recently opened one.
        """
        if not self.__validation_queue:
            raise ValueError('Closing tag </%s> without any opened tag' % tag)
        opened = self.__validation_queue.pop()
        if tag != opened:
            raise ValueError('Closing tag </%s> does not match opened tag <%s>'
                             % (tag, opened))
        # There is no `get_endtag_text()` method :/
        self.__fragments.append('</' + tag + '>\n')

    def handle_data(self, data: str):
        """Copy text content, dropping whitespace-only chunks."""
        if data.strip():
            self.__fragments.append(data)

    def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
        """Record self-closing `<meta/>` tags, copy the other ones verbatim."""
        if tag == 'meta':
            meta = dict(attrs)
            # Placeholders keep a record of meta tags lacking these attributes.
            name = meta.get('name', 'harmful metadata')
            content = meta.get('content', 'harmful data')
            self.__meta[name] = content
        else:
            self.__fragments.append(self.get_starttag_text())

    def remove_all(self, output_filename: str) -> bool:
        """Write the document, without its `<meta/>` tags, to `output_filename`.

        Returns:
            Always `True`; failures are reported through exceptions.

        Raises:
            ValueError: if some tags were never closed. The check happens
                before the output file is opened.
        """
        self.__ensure_all_closed()
        with open(output_filename, 'w') as f:
            f.write(''.join(self.__fragments))
        return True

    def get_meta(self) -> Dict[str, Any]:
        """Return the metadata found in the self-closing `<meta/>` tags.

        Raises:
            ValueError: if some tags were never closed.
        """
        self.__ensure_all_closed()
        return self.__meta
<html>
<head>
<meta content="vim" name="generator"/>
<meta content="jvoisin" name="author"/>
</head>
<body>
<p>
<h1>Hello</h1>
I am a web page.
Please <b>love</b> me.
Here, have a pretty picture: <img src='dirty.jpg' alt='a pretty picture'/>
</p>
</body>
</html>
......@@ -7,7 +7,7 @@ import logging
import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent
from libmat2 import harmless, video
from libmat2 import harmless, video, html
# No need for logging messages: should something go wrong,
# the testsuite _will_ fail.
......@@ -232,3 +232,40 @@ class TestCorruptedFiles(unittest.TestCase):
self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
self.assertFalse(p.remove_all())
os.remove('./tests/data/dirty.zip')
def test_html(self):
    """Exercise HTMLParser on mismatched, malformed, stray and unclosed tags."""
    dirty = './tests/data/dirty.html'
    target = './tests/data/clean.html'
    cleaned = './tests/data/clean.cleaned.html'

    # An end tag that doesn't match the opened one is rejected while parsing.
    shutil.copy(dirty, target)
    with open(target, 'a') as handle:
        handle.write('<open>but not</closed>')
    with self.assertRaises(ValueError):
        html.HTMLParser(target)
    os.remove(target)

    # Yes, we're able to deal with malformed html :/
    shutil.copy(dirty, target)
    with open(target, 'a') as handle:
        handle.write('<meta name=\'this" is="weird"/>')
    malformed = html.HTMLParser(target)
    self.assertTrue(malformed.remove_all())
    reparsed = html.HTMLParser(cleaned)
    self.assertEqual(reparsed.get_meta(), {})
    os.remove(target)
    os.remove(cleaned)

    # An end tag without any opened tag is rejected while parsing.
    with open(target, 'w') as handle:
        handle.write('</close>')
    with self.assertRaises(ValueError):
        html.HTMLParser(target)
    os.remove(target)

    # A tag left open parses fine, but both accessors refuse to proceed.
    with open(target, 'w') as handle:
        handle.write('<notclosed>')
    unclosed = html.HTMLParser(target)
    with self.assertRaises(ValueError):
        unclosed.get_meta()
    unclosed = html.HTMLParser(target)
    with self.assertRaises(ValueError):
        unclosed.remove_all()
    os.remove(target)
......@@ -6,7 +6,7 @@ import os
import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
from libmat2 import check_dependencies, video, archive
from libmat2 import check_dependencies, video, archive, html
class TestCheckDependencies(unittest.TestCase):
......@@ -596,3 +596,21 @@ class TestCleaning(unittest.TestCase):
os.remove('./tests/data/clean.gif')
os.remove('./tests/data/clean.cleaned.gif')
os.remove('./tests/data/clean.cleaned.cleaned.gif')
def test_html(self):
    """Clean a copy of the sample page and check the result holds no metadata."""
    source = './tests/data/clean.html'
    cleaned_once = './tests/data/clean.cleaned.html'
    cleaned_twice = './tests/data/clean.cleaned.cleaned.html'
    shutil.copy('./tests/data/dirty.html', source)

    # The untouched copy still exposes its author, and can be cleaned.
    dirty_parser = html.HTMLParser(source)
    self.assertEqual(dirty_parser.get_meta()['author'], 'jvoisin')
    self.assertTrue(dirty_parser.remove_all())

    # The cleaned file has no metadata left, and can be cleaned again.
    clean_parser = html.HTMLParser(cleaned_once)
    self.assertEqual(clean_parser.get_meta(), {})
    self.assertTrue(clean_parser.remove_all())

    for leftover in (source, cleaned_once, cleaned_twice):
        os.remove(leftover)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment