Skip to content
Snippets Groups Projects
Commit 02ff21b1 authored by Julien (jvoisin) Voisin's avatar Julien (jvoisin) Voisin
Browse files

Implement epub support

parent 6b45064c
No related branches found
No related tags found
No related merge requests found
import logging
import re
import xml.etree.ElementTree as ET # type: ignore
from . import archive, office
class EPUBParser(archive.ArchiveBasedAbstractParser):
    """ An epub file is a zip container; its document-level metadata
    live in `OEBPS/content.opf`, an xml file. """
    mimetypes = {'application/epub+zip', }

    def __init__(self, filename):
        super().__init__(filename)
        # Those members are mandatory in a valid epub container and
        # must survive the cleaning pass.
        self.files_to_keep = {re.compile(pattern) for pattern in (  # type: ignore
            'META-INF/container.xml',
            'mimetype',
            'OEBPS/content.opf',
        )}

    def _specific_get_meta(self, full_path, file_path):
        # Only the opf manifest carries epub-level metadata.
        if file_path != 'OEBPS/content.opf':
            return {}

        with open(full_path, encoding='utf-8') as f:
            try:
                # Pull `<dc:…>value</dc:…>`-style elements out of the
                # manifest; the backreference \1 matches the closing tag.
                matches = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)</\1>",
                                     f.read(), re.I|re.M)
                return dict(matches)
            except (TypeError, UnicodeDecodeError):
                # We didn't manage to parse the xml file
                return {file_path: 'harmful content', }

    def _specific_cleanup(self, full_path: str):
        if not full_path.endswith('OEBPS/content.opf'):
            return True

        try:
            tree, namespace = office._parse_xml(full_path)
        except ET.ParseError:
            logging.error("Unable to parse %s in %s.", full_path, self.filename)
            return False

        # ElementTree has no parent pointers, so build a child->parent map
        # to be able to detach the <metadata> element from its parent.
        parent_map = {child: parent for parent in tree.iter() for child in parent}

        for element in tree.iterfind('.//', namespace):
            if element.tag.strip().lower().endswith('metadata'):
                parent_map[element].remove(element)
                break  # there is only a single <metadata> block
        tree.write(full_path, xml_declaration=True)
        return True
import logging
import glob
import os
import mimetypes
......@@ -10,6 +11,10 @@ assert Tuple # make pyflakes happy
# Type variable for the factory's return value: any concrete subclass
# of abstract.AbstractParser qualifies.
T = TypeVar('T', bound='abstract.AbstractParser')

# Register epub-related types with the stdlib `mimetypes` registry,
# which does not know about them out of the box.
mimetypes.add_type('application/epub+zip', '.epub')
# EPUB Navigation Control XML File
mimetypes.add_type('application/x-dtbncx+xml', '.ncx')
def __load_all_parsers():
""" Loads every parser in a dynamic way """
......@@ -49,6 +54,8 @@ def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
if mtype in parser_class.mimetypes:
try:
return parser_class(filename), mtype
except ValueError:
except ValueError as e:
logging.info("Got an exception when trying to instanciate "
"%s for %s: %s", parser_class, filename, e)
return None, mtype
return None, mtype
from html import parser
from typing import Dict, Any, List, Tuple
import re
import string
from . import abstract
class CSSParser(abstract.AbstractParser):
    """CSS files have no metadata proper, only comments of the form
    `/* … */`, so those comments are what gets removed."""
    mimetypes = {'text/css', }
    flags = re.MULTILINE | re.DOTALL

    def remove_all(self) -> bool:
        with open(self.filename, encoding='utf-8') as f:
            content = f.read()
        # Strip every /* … */ comment; DOTALL makes `.` span newlines.
        cleaned = re.sub(r'/\*.+?\*/', '', content, 0, self.flags)
        with open(self.output_filename, 'w', encoding='utf-8') as f:
            f.write(cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        metadata = {}
        with open(self.filename, encoding='utf-8') as f:
            comments = re.findall(r'/\*(.+?)\*/', f.read(), self.flags)
        for comment in comments:
            for line in comment.splitlines():
                try:
                    # `key: value` lines become metadata entries.
                    key, value = line.split(':')
                    metadata[key.strip(string.whitespace + '*')] = value.strip()
                except ValueError:
                    # Anything that isn't a single `key: value` pair.
                    metadata['harmful data'] = line.strip()
        return metadata
class HTMLParser(abstract.AbstractParser):
    """Parser for html documents and epub navigation (.ncx) files,
    delegating the actual work to the _HTMLParser helper."""
    # NOTE(review): this span contained interleaved pre/post diff lines
    # (duplicate `mimetypes` assignment, duplicate parser construction and
    # `open` calls); resolved to the post-change version.
    mimetypes = {'text/html', 'application/x-dtbncx+xml', }

    def __init__(self, filename):
        super().__init__(filename)
        self.__parser = _HTMLParser(self.filename)
        # Feeding the whole document may raise ValueError on malformed
        # markup (unbalanced tags) — see _HTMLParser.handle_endtag.
        with open(filename, encoding='utf-8') as f:
            self.__parser.feed(f.read())
        self.__parser.close()
......@@ -25,45 +54,69 @@ class _HTMLParser(parser.HTMLParser):
we're using an internal queue to track all the opening/closing tags,
and hoping for the best.
"""
# Tags whose content is dropped from the output; self-closing blacklisted
# tags are additionally mined for metadata. Everything is lowercase.
tag_blacklist = {'doctitle', 'meta'}  # everything is lowercase

def __init__(self, filename):
    # The stale pre-change signature `def __init__(self):` left interleaved
    # by the diff is removed; only the filename-taking version remains.
    super().__init__()
    self.filename = filename
    self.__textrepr = ''          # cleaned-up serialisation of the document
    self.__meta = {}              # metadata harvested from blacklisted tags
    self.__validation_queue = []  # stack of currently-open tags
    # We're using a counter instead of a boolean to handle nested tags
    self.__in_dangerous_tag = 0
def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
    """Track the opening tag, and serialise it unless we are inside a
    blacklisted tag whose content must be dropped.

    (The stale pre-change line that unconditionally appended the start
    tag before the blacklist check is removed.)
    """
    self.__validation_queue.append(tag)
    if tag in self.tag_blacklist:
        self.__in_dangerous_tag += 1
        return

    if self.__in_dangerous_tag == 0:
        self.__textrepr += self.get_starttag_text()
def handle_endtag(self, tag: str):
    """Validate tag nesting, then serialise the closing tag when allowed.

    Raises:
        ValueError: when the closing tag has no matching opening one,
            or does not match the most recently opened tag.

    (Stale pre-change lines — the bare `raise ValueError` statements and
    the unconditional serialisation — are removed.)
    """
    if not self.__validation_queue:
        raise ValueError("The closing tag %s doesn't have a corresponding "
                         "opening one in %s." % (tag, self.filename))

    previous_tag = self.__validation_queue.pop()
    if tag != previous_tag:
        raise ValueError("The closing tag %s doesn't match the previous "
                         "tag %s in %s" %
                         (tag, previous_tag, self.filename))
    elif tag in self.tag_blacklist:
        self.__in_dangerous_tag -= 1
        return

    if self.__in_dangerous_tag == 0:
        # There is no `get_endtag_text()` method :/
        self.__textrepr += '</' + tag + '>\n'
def handle_data(self, data: str):
    """Keep non-blank text content, unless inside a blacklisted tag.

    (The stale pre-change condition `if data.strip():` without the
    dangerous-tag guard is removed.)
    """
    if self.__in_dangerous_tag == 0 and data.strip():
        self.__textrepr += data
def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
    """Handle self-closing tags: harvest metadata from blacklisted ones
    (e.g. `<meta name=… content=…/>`), serialise the others.

    (The stale pre-change `if tag == 'meta':` test and unconditional
    serialisation in the else branch are removed.)
    """
    if tag in self.tag_blacklist:
        meta = {k: v for k, v in attrs}
        name = meta.get('name', 'harmful metadata')
        content = meta.get('content', 'harmful data')
        self.__meta[name] = content
    else:
        if self.__in_dangerous_tag == 0:
            self.__textrepr += self.get_starttag_text()
def remove_all(self, output_filename: str) -> bool:
    """Write the metadata-free serialisation to `output_filename`.

    Raises:
        ValueError: if the document still has unclosed tags.

    (The stale pre-change bare `raise ValueError` and encoding-less
    `open` lines are removed.)
    """
    if self.__validation_queue:
        raise ValueError("Some tags (%s) were left unclosed in %s" % (
            ', '.join(self.__validation_queue),
            self.filename))
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(self.__textrepr)
    return True
def get_meta(self) -> Dict[str, Any]:
    """Return the metadata harvested from blacklisted tags.

    Raises:
        ValueError: if the document still has unclosed tags.

    (The stale pre-change bare `raise ValueError` line is removed.)
    """
    if self.__validation_queue:
        raise ValueError("Some tags (%s) were left unclosed in %s" % (
            ', '.join(self.__validation_queue),
            self.filename))
    return self.__meta
/**
* This is my super css framework
* version: 1.0
* author : jvoisin
*/
body {
color: red;
background-color: blue;
}
.underline {
text-decoration: underline; /* underline is cool */
}
File added
File added
......@@ -7,7 +7,7 @@ import logging
import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent
from libmat2 import harmless, video, html
from libmat2 import harmless, video, web
# No need to logging messages, should something go wrong,
# the testsuite _will_ fail.
......@@ -220,34 +220,34 @@ class TestCorruptedFiles(unittest.TestCase):
os.remove('./tests/data/--output.avi')
def test_zip(self):
    # NOTE(review): this span interleaved old (`dirty.zip`) and new
    # (`clean.zip`) diff lines; resolved to the post-change version.
    # Build a zip holding one corrupted member among dirty-but-valid ones.
    with zipfile.ZipFile('./tests/data/clean.zip', 'w') as zout:
        zout.write('./tests/data/dirty.flac')
        zout.write('./tests/data/dirty.docx')
        zout.write('./tests/data/dirty.jpg')
        zout.write('./tests/data/embedded_corrupted.docx')
    p, mimetype = parser_factory.get_parser('./tests/data/clean.zip')
    self.assertEqual(mimetype, 'application/zip')
    meta = p.get_meta()
    self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
    self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
    # The corrupted embedded docx must make the cleaning fail as a whole.
    self.assertFalse(p.remove_all())
    os.remove('./tests/data/clean.zip')
def test_html(self):
    # NOTE(review): this span interleaved old (`html.HTMLParser`) and new
    # (`web.HTMLParser`) diff lines; resolved to the post-change version.
    # An unbalanced closing tag must make the parser raise at load time.
    shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
    with open('./tests/data/clean.html', 'a') as f:
        f.write('<open>but not</closed>')
    with self.assertRaises(ValueError):
        web.HTMLParser('./tests/data/clean.html')
    os.remove('./tests/data/clean.html')

    # Yes, we're able to deal with malformed html :/
    shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
    with open('./tests/data/clean.html', 'a') as f:
        f.write('<meta name=\'this" is="weird"/>')
    p = web.HTMLParser('./tests/data/clean.html')
    self.assertTrue(p.remove_all())
    p = web.HTMLParser('./tests/data/clean.cleaned.html')
    self.assertEqual(p.get_meta(), {})
    os.remove('./tests/data/clean.html')
    os.remove('./tests/data/clean.cleaned.html')
......@@ -255,17 +255,38 @@ class TestCorruptedFiles(unittest.TestCase):
with open('./tests/data/clean.html', 'w') as f:
f.write('</close>')
with self.assertRaises(ValueError):
html.HTMLParser('./tests/data/clean.html')
web.HTMLParser('./tests/data/clean.html')
os.remove('./tests/data/clean.html')
with open('./tests/data/clean.html', 'w') as f:
f.write('<notclosed>')
p = html.HTMLParser('./tests/data/clean.html')
p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError):
p.get_meta()
p = html.HTMLParser('./tests/data/clean.html')
p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError):
p.remove_all()
os.remove('./tests/data/clean.html')
with open('./tests/data/clean.html', 'w') as f:
f.write('<doctitle><br/></doctitle><br/><notclosed>')
p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError):
p.get_meta()
p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError):
p.remove_all()
os.remove('./tests/data/clean.html')
def test_epub(self):
    # Build a bogus epub: a jpg masquerading as the OEBPS/content.opf
    # metadata file inside the zip container.
    with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout:
        zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf')
    p, mimetype = parser_factory.get_parser('./tests/data/clean.epub')
    self.assertEqual(mimetype, 'application/epub+zip')
    meta = p.get_meta()
    # The jpg is not valid utf-8 text, so the parser flags the file as
    # harmful instead of returning key/value metadata pairs.
    self.assertEqual(meta['OEBPS/content.opf']['OEBPS/content.opf'],
                     'harmful content')
    self.assertFalse(p.remove_all())
    os.remove('./tests/data/clean.epub')
......@@ -6,7 +6,7 @@ import os
import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
from libmat2 import check_dependencies, video, archive, html
from libmat2 import check_dependencies, video, archive, web, epub
class TestCheckDependencies(unittest.TestCase):
......@@ -177,6 +177,23 @@ class TestGetMeta(unittest.TestCase):
meta = p.get_meta()
self.assertEqual(meta['Comment'], 'this is a test comment')
def test_epub(self):
    p, mimetype = parser_factory.get_parser('./tests/data/dirty.epub')
    self.assertEqual(mimetype, 'application/epub+zip')
    meta = p.get_meta()
    # Metadata is expected from the opf manifest, the ncx navigation
    # file, and the members embedded in the archive (image, html).
    self.assertEqual(meta['OEBPS/content.opf']['dc:creator'], 'Dorothy L. Sayers')
    self.assertEqual(meta['OEBPS/toc.ncx']['dtb:generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
    self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@images@shield25.jpg']['CreatorTool'], 'Adobe Photoshop CS5 Macintosh')
    self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@58820-h-2.htm.html']['generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
def test_css(self):
    p, mimetype = parser_factory.get_parser('./tests/data/dirty.css')
    self.assertEqual(mimetype, 'text/css')
    meta = p.get_meta()
    # `key: value` lines inside /* … */ comments become metadata entries;
    # comment lines that don't split on ':' are reported as harmful data.
    self.assertEqual(meta['author'], 'jvoisin')
    self.assertEqual(meta['version'], '1.0')
    self.assertEqual(meta['harmful data'], 'underline is cool')
class TestRemovingThumbnails(unittest.TestCase):
def test_odt(self):
shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt')
......@@ -599,7 +616,7 @@ class TestCleaning(unittest.TestCase):
def test_html(self):
shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
p = html.HTMLParser('./tests/data/clean.html')
p = web.HTMLParser('./tests/data/clean.html')
meta = p.get_meta()
self.assertEqual(meta['author'], 'jvoisin')
......@@ -607,10 +624,50 @@ class TestCleaning(unittest.TestCase):
ret = p.remove_all()
self.assertTrue(ret)
p = html.HTMLParser('./tests/data/clean.cleaned.html')
p = web.HTMLParser('./tests/data/clean.cleaned.html')
self.assertEqual(p.get_meta(), {})
self.assertTrue(p.remove_all())
os.remove('./tests/data/clean.html')
os.remove('./tests/data/clean.cleaned.html')
os.remove('./tests/data/clean.cleaned.cleaned.html')
def test_epub(self):
    shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub')
    p = epub.EPUBParser('./tests/data/clean.epub')
    meta = p.get_meta()
    self.assertEqual(meta['OEBPS/content.opf']['dc:source'], 'http://www.gutenberg.org/files/58820/58820-h/58820-h.htm')
    ret = p.remove_all()
    self.assertTrue(ret)
    # Parsing the cleaned file again must yield no metadata at all,
    # and cleaning it a second time must still succeed.
    p = epub.EPUBParser('./tests/data/clean.cleaned.epub')
    self.assertEqual(p.get_meta(), {})
    self.assertTrue(p.remove_all())
    os.remove('./tests/data/clean.epub')
    os.remove('./tests/data/clean.cleaned.epub')
    os.remove('./tests/data/clean.cleaned.cleaned.epub')
def test_css(self):
    shutil.copy('./tests/data/dirty.css', './tests/data/clean.css')
    p = web.CSSParser('./tests/data/clean.css')
    self.assertEqual(p.get_meta(), {
        'harmful data': 'underline is cool',
        'version': '1.0',
        'author': 'jvoisin'})
    ret = p.remove_all()
    self.assertTrue(ret)
    # Parsing the cleaned file again must yield no metadata at all,
    # and cleaning it a second time must still succeed.
    p = web.CSSParser('./tests/data/clean.cleaned.css')
    self.assertEqual(p.get_meta(), {})
    self.assertTrue(p.remove_all())
    os.remove('./tests/data/clean.css')
    os.remove('./tests/data/clean.cleaned.css')
    os.remove('./tests/data/clean.cleaned.cleaned.css')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment