Skip to content
Snippets Groups Projects
Commit b983d283 authored by Julien (jvoisin) Voisin's avatar Julien (jvoisin) Voisin
Browse files

Add a preliminary implementation of epub support

parent 6b45064c
Branches add_epub
No related tags found
No related merge requests found
import logging
import re
import xml.etree.ElementTree as ET # type: ignore
from . import archive, office
class EPUBParser(archive.ArchiveBasedAbstractParser):
    """ Parser for epub files: zip archives whose package-level
    metadata lives in the OPF document (OEBPS/content.opf). """
    mimetypes = {'application/epub+zip', }

    def __init__(self, filename):
        super().__init__(filename)
        # These members are mandatory for a valid epub and must
        # survive the cleaning pass untouched.
        required_members = {
            'META-INF/container.xml',
            'mimetype',
            'OEBPS/content.opf',
        }
        self.files_to_keep = {re.compile(p) for p in required_members}  # type: ignore

    def _specific_get_meta(self, full_path, file_path):
        """ Harvest metadata from the OPF package document;
        every other member is handled generically. """
        if file_path != 'OEBPS/content.opf':
            return {}
        with open(full_path, encoding='utf-8') as f:
            try:
                # Pull out <meta …>, <dc:…> and <cp:…> elements as
                # tag -> text pairs via a backreferenced regex.
                matches = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)</\1>",
                                     f.read(), re.I | re.M)
            except (TypeError, UnicodeDecodeError):
                # The opf couldn't be read/parsed as text: flag it wholesale.
                return {file_path: 'harmful content', }
            return dict(matches)

    def _specific_cleanup(self, full_path: str):
        """ Drop the whole <metadata> element from the OPF document. """
        if not full_path.endswith('OEBPS/content.opf'):
            return True
        try:
            tree, namespace = office._parse_xml(full_path)
        except ET.ParseError:
            logging.error("Unable to parse %s in %s.", full_path, self.filename)
            return False
        # ElementTree has no parent pointers, so build a child -> parent map.
        parents = {child: parent for parent in tree.iter() for child in parent}
        for element in tree.iterfind('.//', namespace):
            if element.tag.strip().lower().endswith('metadata'):
                parents[element].remove(element)
                break
        tree.write(full_path, xml_declaration=True)
        return True
...@@ -5,7 +5,7 @@ from . import abstract ...@@ -5,7 +5,7 @@ from . import abstract
class HarmlessParser(abstract.AbstractParser): class HarmlessParser(abstract.AbstractParser):
""" This is the parser for filetypes that can not contain metadata. """ """ This is the parser for filetypes that can not contain metadata. """
mimetypes = {'text/plain', 'image/x-ms-bmp'} mimetypes = {'text/plain', 'image/x-ms-bmp', }
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, dict]]:
return dict() return dict()
......
...@@ -10,6 +10,9 @@ assert Tuple # make pyflakes happy ...@@ -10,6 +10,9 @@ assert Tuple # make pyflakes happy
T = TypeVar('T', bound='abstract.AbstractParser') T = TypeVar('T', bound='abstract.AbstractParser')
mimetypes.add_type('application/epub+zip', '.epub')
mimetypes.add_type('application/x-dtbncx+xml', '.ncx')
def __load_all_parsers(): def __load_all_parsers():
""" Loads every parser in a dynamic way """ """ Loads every parser in a dynamic way """
......
from html import parser from html import parser
from typing import Dict, Any, List, Tuple from typing import Dict, Any, List, Tuple
import re
import string
from . import abstract from . import abstract
class CSSParser(abstract.AbstractParser):
    """ Parser for CSS files; the only metadata a stylesheet can carry
    hides inside `/* ... */` comments. """
    mimetypes = {'text/css', }
    flags = re.MULTILINE | re.DOTALL

    def remove_all(self) -> bool:
        """ Write a copy of the stylesheet with every comment stripped. """
        with open(self.filename, encoding='utf-8') as f:
            cleaned = re.sub(r'/\*.+?\*/', '', f.read(), 0, self.flags)
        with open(self.output_filename, 'w', encoding='utf-8') as f:
            f.write(cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """ Collect `key: value` pairs found in comments; lines that
        don't look like a pair are reported under 'harmful data'. """
        metadata = {}
        with open(self.filename, encoding='utf-8') as f:
            cssdoc = re.findall(r'/\*(.+?)\*/', f.read(), self.flags)
            for match in cssdoc:
                for line in match.splitlines():
                    try:
                        # Split on the first colon only, so values that
                        # themselves contain colons (urls, timestamps, …)
                        # are parsed instead of being flagged as harmful.
                        k, v = line.split(':', 1)
                        metadata[k.strip(string.whitespace + '*')] = v.strip()
                    except ValueError:
                        metadata['harmful data'] = line.strip()
        return metadata
class HTMLParser(abstract.AbstractParser): class HTMLParser(abstract.AbstractParser):
mimetypes = {'text/html', } mimetypes = {'text/html', 'application/x-dtbncx+xml', }
def __init__(self, filename): def __init__(self, filename):
super().__init__(filename) super().__init__(filename)
self.__parser = _HTMLParser() self.__parser = _HTMLParser(self.filename)
with open(filename) as f: with open(filename, encoding='utf-8') as f:
self.__parser.feed(f.read()) self.__parser.feed(f.read())
self.__parser.close() self.__parser.close()
...@@ -25,45 +52,69 @@ class _HTMLParser(parser.HTMLParser): ...@@ -25,45 +52,69 @@ class _HTMLParser(parser.HTMLParser):
we're using an internal queue to track all the opening/closing tags, we're using an internal queue to track all the opening/closing tags,
and hoping for the best. and hoping for the best.
""" """
def __init__(self): tag_blacklist = {'doctitle', 'meta'} # everything is lowercase
def __init__(self, filename):
super().__init__() super().__init__()
self.filename = filename
self.__textrepr = '' self.__textrepr = ''
self.__meta = {} self.__meta = {}
self.__validation_queue = [] self.__validation_queue = []
# We're using a counter instead of a boolean to handle nested tags
self.__in_dangerous_tag = 0
def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]): def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
self.__textrepr += self.get_starttag_text()
self.__validation_queue.append(tag) self.__validation_queue.append(tag)
if tag in self.tag_blacklist:
self.__in_dangerous_tag += 1
return
if self.__in_dangerous_tag == 0:
self.__textrepr += self.get_starttag_text()
def handle_endtag(self, tag: str): def handle_endtag(self, tag: str):
if not self.__validation_queue: if not self.__validation_queue:
raise ValueError raise ValueError("The closing tag %s doesn't have a corresponding "
elif tag != self.__validation_queue.pop(): "opening one in %s." % (tag, self.filename))
raise ValueError
previous_tag = self.__validation_queue.pop()
if tag != previous_tag:
raise ValueError("The closing tag %s doesn't match the previous "
"tag %s in %s" %
(tag, previous_tag, self.filename))
elif tag in self.tag_blacklist:
self.__in_dangerous_tag -= 1
return
if self.__in_dangerous_tag == 0:
# There is no `get_endtag_text()` method :/ # There is no `get_endtag_text()` method :/
self.__textrepr += '</' + tag + '>\n' self.__textrepr += '</' + tag + '>\n'
def handle_data(self, data: str): def handle_data(self, data: str):
if data.strip(): if self.__in_dangerous_tag == 0 and data.strip():
self.__textrepr += data self.__textrepr += data
def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]): def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
if tag == 'meta': if tag in self.tag_blacklist:
meta = {k:v for k, v in attrs} meta = {k:v for k, v in attrs}
name = meta.get('name', 'harmful metadata') name = meta.get('name', 'harmful metadata')
content = meta.get('content', 'harmful data') content = meta.get('content', 'harmful data')
self.__meta[name] = content self.__meta[name] = content
else: else:
if self.__in_dangerous_tag == 0:
self.__textrepr += self.get_starttag_text() self.__textrepr += self.get_starttag_text()
def remove_all(self, output_filename: str) -> bool: def remove_all(self, output_filename: str) -> bool:
if self.__validation_queue: if self.__validation_queue:
raise ValueError raise ValueError("Some tags (%s) were left unclosed in %s" % (
with open(output_filename, 'w') as f: ', '.join(self.__validation_queue),
self.filename))
with open(output_filename, 'w', encoding='utf-8') as f:
f.write(self.__textrepr) f.write(self.__textrepr)
return True return True
def get_meta(self) -> Dict[str, Any]: def get_meta(self) -> Dict[str, Any]:
if self.__validation_queue: if self.__validation_queue:
raise ValueError raise ValueError("Some tags (%s) were left unclosed in %s" % (
', '.join(self.__validation_queue),
self.filename))
return self.__meta return self.__meta
/**
* This is my super css framework
* version: 1.0
* author : jvoisin
*/
body {
color: red;
background-color: blue;
}
.underline {
text-decoration: underline; /* underline is cool */
}
File added
File added
...@@ -7,7 +7,7 @@ import logging ...@@ -7,7 +7,7 @@ import logging
import zipfile import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent from libmat2 import pdf, images, audio, office, parser_factory, torrent
from libmat2 import harmless, video, html from libmat2 import harmless, video, web
# No need to logging messages, should something go wrong, # No need to logging messages, should something go wrong,
# the testsuite _will_ fail. # the testsuite _will_ fail.
...@@ -220,34 +220,34 @@ class TestCorruptedFiles(unittest.TestCase): ...@@ -220,34 +220,34 @@ class TestCorruptedFiles(unittest.TestCase):
os.remove('./tests/data/--output.avi') os.remove('./tests/data/--output.avi')
def test_zip(self): def test_zip(self):
with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout: with zipfile.ZipFile('./tests/data/clean.zip', 'w') as zout:
zout.write('./tests/data/dirty.flac') zout.write('./tests/data/dirty.flac')
zout.write('./tests/data/dirty.docx') zout.write('./tests/data/dirty.docx')
zout.write('./tests/data/dirty.jpg') zout.write('./tests/data/dirty.jpg')
zout.write('./tests/data/embedded_corrupted.docx') zout.write('./tests/data/embedded_corrupted.docx')
p, mimetype = parser_factory.get_parser('./tests/data/dirty.zip') p, mimetype = parser_factory.get_parser('./tests/data/clean.zip')
self.assertEqual(mimetype, 'application/zip') self.assertEqual(mimetype, 'application/zip')
meta = p.get_meta() meta = p.get_meta()
self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !') self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!') self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
self.assertFalse(p.remove_all()) self.assertFalse(p.remove_all())
os.remove('./tests/data/dirty.zip') os.remove('./tests/data/clean.zip')
def test_html(self): def test_html(self):
shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
with open('./tests/data/clean.html', 'a') as f: with open('./tests/data/clean.html', 'a') as f:
f.write('<open>but not</closed>') f.write('<open>but not</closed>')
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
html.HTMLParser('./tests/data/clean.html') web.HTMLParser('./tests/data/clean.html')
os.remove('./tests/data/clean.html') os.remove('./tests/data/clean.html')
# Yes, we're able to deal with malformed html :/ # Yes, we're able to deal with malformed html :/
shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
with open('./tests/data/clean.html', 'a') as f: with open('./tests/data/clean.html', 'a') as f:
f.write('<meta name=\'this" is="weird"/>') f.write('<meta name=\'this" is="weird"/>')
p = html.HTMLParser('./tests/data/clean.html') p = web.HTMLParser('./tests/data/clean.html')
self.assertTrue(p.remove_all()) self.assertTrue(p.remove_all())
p = html.HTMLParser('./tests/data/clean.cleaned.html') p = web.HTMLParser('./tests/data/clean.cleaned.html')
self.assertEqual(p.get_meta(), {}) self.assertEqual(p.get_meta(), {})
os.remove('./tests/data/clean.html') os.remove('./tests/data/clean.html')
os.remove('./tests/data/clean.cleaned.html') os.remove('./tests/data/clean.cleaned.html')
...@@ -255,17 +255,38 @@ class TestCorruptedFiles(unittest.TestCase): ...@@ -255,17 +255,38 @@ class TestCorruptedFiles(unittest.TestCase):
with open('./tests/data/clean.html', 'w') as f: with open('./tests/data/clean.html', 'w') as f:
f.write('</close>') f.write('</close>')
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
html.HTMLParser('./tests/data/clean.html') web.HTMLParser('./tests/data/clean.html')
os.remove('./tests/data/clean.html') os.remove('./tests/data/clean.html')
with open('./tests/data/clean.html', 'w') as f: with open('./tests/data/clean.html', 'w') as f:
f.write('<notclosed>') f.write('<notclosed>')
p = html.HTMLParser('./tests/data/clean.html') p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
p.get_meta() p.get_meta()
p = html.HTMLParser('./tests/data/clean.html') p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
p.remove_all() p.remove_all()
os.remove('./tests/data/clean.html') os.remove('./tests/data/clean.html')
with open('./tests/data/clean.html', 'w') as f:
f.write('<doctitle><br/></doctitle><br/><notclosed>')
p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError):
p.get_meta()
p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError):
p.remove_all()
os.remove('./tests/data/clean.html')
    def test_epub(self):
        """A corrupted epub (a jpeg masquerading as the OPF document)
        must be reported as harmful and refuse to be cleaned."""
        with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout:
            # Deliberately stuff binary image data where xml is expected.
            zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf')
        p, mimetype = parser_factory.get_parser('./tests/data/clean.epub')
        self.assertEqual(mimetype, 'application/epub+zip')
        meta = p.get_meta()
        # The unparsable opf is flagged wholesale as harmful content.
        self.assertEqual(meta['OEBPS/content.opf']['OEBPS/content.opf'],
                         'harmful content')
        # Cleaning must fail rather than silently produce a broken epub.
        self.assertFalse(p.remove_all())
        os.remove('./tests/data/clean.epub')
...@@ -6,7 +6,7 @@ import os ...@@ -6,7 +6,7 @@ import os
import zipfile import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
from libmat2 import check_dependencies, video, archive, html from libmat2 import check_dependencies, video, archive, web, epub
class TestCheckDependencies(unittest.TestCase): class TestCheckDependencies(unittest.TestCase):
...@@ -177,6 +177,23 @@ class TestGetMeta(unittest.TestCase): ...@@ -177,6 +177,23 @@ class TestGetMeta(unittest.TestCase):
meta = p.get_meta() meta = p.get_meta()
self.assertEqual(meta['Comment'], 'this is a test comment') self.assertEqual(meta['Comment'], 'this is a test comment')
    def test_epub(self):
        """Metadata must be surfaced from every layer of an epub:
        the opf, the ncx table of contents, and embedded images/html."""
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.epub')
        self.assertEqual(mimetype, 'application/epub+zip')
        meta = p.get_meta()
        self.assertEqual(meta['OEBPS/content.opf']['dc:creator'], 'Dorothy L. Sayers')
        self.assertEqual(meta['OEBPS/toc.ncx']['dtb:generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
        self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@images@shield25.jpg']['CreatorTool'], 'Adobe Photoshop CS5 Macintosh')
        self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@58820-h-2.htm.html']['generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
    def test_css(self):
        """`key: value` pairs in css comments must be extracted, and
        non-pair comment lines reported under 'harmful data'."""
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.css')
        self.assertEqual(mimetype, 'text/css')
        meta = p.get_meta()
        self.assertEqual(meta['author'], 'jvoisin')
        self.assertEqual(meta['version'], '1.0')
        # 'underline is cool' is a comment line with no colon in it.
        self.assertEqual(meta['harmful data'], 'underline is cool')
class TestRemovingThumbnails(unittest.TestCase): class TestRemovingThumbnails(unittest.TestCase):
def test_odt(self): def test_odt(self):
shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt') shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt')
...@@ -599,7 +616,7 @@ class TestCleaning(unittest.TestCase): ...@@ -599,7 +616,7 @@ class TestCleaning(unittest.TestCase):
def test_html(self): def test_html(self):
shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
p = html.HTMLParser('./tests/data/clean.html') p = web.HTMLParser('./tests/data/clean.html')
meta = p.get_meta() meta = p.get_meta()
self.assertEqual(meta['author'], 'jvoisin') self.assertEqual(meta['author'], 'jvoisin')
...@@ -607,10 +624,50 @@ class TestCleaning(unittest.TestCase): ...@@ -607,10 +624,50 @@ class TestCleaning(unittest.TestCase):
ret = p.remove_all() ret = p.remove_all()
self.assertTrue(ret) self.assertTrue(ret)
p = html.HTMLParser('./tests/data/clean.cleaned.html') p = web.HTMLParser('./tests/data/clean.cleaned.html')
self.assertEqual(p.get_meta(), {}) self.assertEqual(p.get_meta(), {})
self.assertTrue(p.remove_all()) self.assertTrue(p.remove_all())
os.remove('./tests/data/clean.html') os.remove('./tests/data/clean.html')
os.remove('./tests/data/clean.cleaned.html') os.remove('./tests/data/clean.cleaned.html')
os.remove('./tests/data/clean.cleaned.cleaned.html') os.remove('./tests/data/clean.cleaned.cleaned.html')
    def test_epub(self):
        """Round-trip: clean a dirty epub, then verify the cleaned copy
        has no metadata left and can itself be cleaned again."""
        shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub')
        p = epub.EPUBParser('./tests/data/clean.epub')
        meta = p.get_meta()
        self.assertEqual(meta['OEBPS/content.opf']['dc:source'], 'http://www.gutenberg.org/files/58820/58820-h/58820-h.htm')
        ret = p.remove_all()
        self.assertTrue(ret)
        # The cleaned file must come back metadata-free.
        p = epub.EPUBParser('./tests/data/clean.cleaned.epub')
        self.assertEqual(p.get_meta(), {})
        # Cleaning an already-clean file must still succeed (idempotence).
        self.assertTrue(p.remove_all())
        os.remove('./tests/data/clean.epub')
        os.remove('./tests/data/clean.cleaned.epub')
        os.remove('./tests/data/clean.cleaned.cleaned.epub')
    def test_css(self):
        """Round-trip: clean a dirty stylesheet, then verify the cleaned
        copy has no metadata left and can itself be cleaned again."""
        shutil.copy('./tests/data/dirty.css', './tests/data/clean.css')
        p = web.CSSParser('./tests/data/clean.css')
        self.assertEqual(p.get_meta(), {
            'harmful data': 'underline is cool',
            'version': '1.0',
            'author': 'jvoisin'})
        ret = p.remove_all()
        self.assertTrue(ret)
        # All comments (and thus all metadata) must be gone.
        p = web.CSSParser('./tests/data/clean.cleaned.css')
        self.assertEqual(p.get_meta(), {})
        # Cleaning an already-clean file must still succeed (idempotence).
        self.assertTrue(p.remove_all())
        os.remove('./tests/data/clean.css')
        os.remove('./tests/data/clean.cleaned.css')
        os.remove('./tests/data/clean.cleaned.cleaned.css')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment