Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
  • #96
  • add_epub
  • handle_html
  • implement_lightweight_mode_msoffice
  • master
  • office
  • refactor_office
  • 0.1.0
  • 0.1.1
  • 0.1.2
  • 0.1.3
  • 0.2.0
  • 0.3.0
  • 0.3.1
  • 0.4.0
  • 0.5.0
  • 0.6.0
  • 0.7.0
18 results

Target

Select target project
  • tguinot/mat2
  • jvoisin/mat2
  • dachary/mat2
  • mejo-/mat2
  • LogicalDash/mat2
  • dkg/mat2
  • christian/mat2
  • Selflike323/mat2
  • fz/mat2
  • iwwmidatlanticgdc/mat2
  • Gu1nn3zz/mat2
  • smagnin/mat2
  • flashcode/mat2
  • MANCASTILLEJA/mat2
  • jboursier/mat2
  • tails/mat2
  • matiargs/mat2
  • Brolf/mat2
  • madaidan/mat2
  • Delmer84/mat2
  • yuebyzua/mat2
  • yyyyyyyan/mat2
  • rmnvgr/mat2
  • Marxism-Leninism/mat2
  • GNUtoo/mat2
  • allexj/mat2
  • b068931cc450442b63f5b3d276ea4297/mat2
  • chenrui/mat2
  • nosec13346/mat2
  • anelki/mat2
30 results
Select Git revision
  • fix_heic
  • master
  • 0.1.0
  • 0.1.1
  • 0.1.2
  • 0.1.3
  • 0.10.0
  • 0.10.1
  • 0.11.0
  • 0.12.0
  • 0.12.1
  • 0.12.2
  • 0.12.3
  • 0.12.4
  • 0.13.0
  • 0.13.1
  • 0.13.2
  • 0.13.3
  • 0.13.4
  • 0.13.5
  • 0.2.0
  • 0.3.0
  • 0.3.1
  • 0.4.0
  • 0.5.0
  • 0.6.0
  • 0.7.0
  • 0.8.0
  • 0.9.0
29 results
Show changes
Commits on Source (2)
from html import parser
from typing import Dict, Any, List, Tuple
from . import abstract
class HTMLParser(abstract.AbstractParser):
mimetypes = {'text/html', }
def __init__(self, filename):
super().__init__(filename)
self.__parser = _HTMLParser()
with open(filename) as f:
self.__parser.feed(f.read())
self.__parser.close()
def get_meta(self) -> Dict[str, Any]:
return self.__parser.get_meta()
def remove_all(self) -> bool:
return self.__parser.remove_all(self.output_filename)
class _HTMLParser(parser.HTMLParser):
"""Python doesn't have a validating html parser in its stdlib, so
we're using an internal queue to track all the opening/closing tags,
and hoping for the best.
"""
def __init__(self):
super().__init__()
self.__textrepr = ''
self.__meta = {}
self.__validation_queue = []
def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
self.__textrepr += self.get_starttag_text()
self.__validation_queue.append(tag)
def handle_endtag(self, tag: str):
if not self.__validation_queue:
raise ValueError
elif tag != self.__validation_queue.pop():
raise ValueError
# There is no `get_endtag_text()` method :/
self.__textrepr += '</' + tag + '>\n'
def handle_data(self, data: str):
if data.strip():
self.__textrepr += data
def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
if tag == 'meta':
meta = {k:v for k, v in attrs}
name = meta.get('name', 'harmful metadata')
content = meta.get('content', 'harmful data')
self.__meta[name] = content
else:
self.__textrepr += self.get_starttag_text()
def remove_all(self, output_filename: str) -> bool:
if self.__validation_queue:
raise ValueError
with open(output_filename, 'w') as f:
f.write(self.__textrepr)
return True
def get_meta(self) -> Dict[str, Any]:
if self.__validation_queue:
raise ValueError
return self.__meta
<html>
<head>
<meta content="vim" name="generator"/>
<meta content="jvoisin" name="author"/>
</head>
<body>
<p>
<h1>Hello</h1>
I am a web page.
Please <b>love</b> me.
Here, have a pretty picture: <img src='dirty.jpg' alt='a pretty picture'/>
</p>
</body>
</html>
......@@ -7,7 +7,7 @@ import logging
import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent
from libmat2 import harmless, video
from libmat2 import harmless, video, html
# No need to logging messages, should something go wrong,
# the testsuite _will_ fail.
......@@ -232,3 +232,40 @@ class TestCorruptedFiles(unittest.TestCase):
self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
self.assertFalse(p.remove_all())
os.remove('./tests/data/dirty.zip')
def test_html(self):
shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
with open('./tests/data/clean.html', 'a') as f:
f.write('<open>but not</closed>')
with self.assertRaises(ValueError):
html.HTMLParser('./tests/data/clean.html')
os.remove('./tests/data/clean.html')
# Yes, we're able to deal with malformed html :/
shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
with open('./tests/data/clean.html', 'a') as f:
f.write('<meta name=\'this" is="weird"/>')
p = html.HTMLParser('./tests/data/clean.html')
self.assertTrue(p.remove_all())
p = html.HTMLParser('./tests/data/clean.cleaned.html')
self.assertEqual(p.get_meta(), {})
os.remove('./tests/data/clean.html')
os.remove('./tests/data/clean.cleaned.html')
with open('./tests/data/clean.html', 'w') as f:
f.write('</close>')
with self.assertRaises(ValueError):
html.HTMLParser('./tests/data/clean.html')
os.remove('./tests/data/clean.html')
with open('./tests/data/clean.html', 'w') as f:
f.write('<notclosed>')
p = html.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError):
p.get_meta()
p = html.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError):
p.remove_all()
os.remove('./tests/data/clean.html')
......@@ -6,7 +6,7 @@ import os
import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
from libmat2 import check_dependencies, video, archive
from libmat2 import check_dependencies, video, archive, html
class TestCheckDependencies(unittest.TestCase):
......@@ -596,3 +596,21 @@ class TestCleaning(unittest.TestCase):
os.remove('./tests/data/clean.gif')
os.remove('./tests/data/clean.cleaned.gif')
os.remove('./tests/data/clean.cleaned.cleaned.gif')
def test_html(self):
shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
p = html.HTMLParser('./tests/data/clean.html')
meta = p.get_meta()
self.assertEqual(meta['author'], 'jvoisin')
ret = p.remove_all()
self.assertTrue(ret)
p = html.HTMLParser('./tests/data/clean.cleaned.html')
self.assertEqual(p.get_meta(), {})
self.assertTrue(p.remove_all())
os.remove('./tests/data/clean.html')
os.remove('./tests/data/clean.cleaned.html')
os.remove('./tests/data/clean.cleaned.cleaned.html')