Skip to content
Snippets Groups Projects
Commit b983d283 authored by Julien (jvoisin) Voisin's avatar Julien (jvoisin) Voisin
Browse files

Add a preliminary implementation of epub support

parent 6b45064c
Branches add_epub
No related tags found
No related merge requests found
import logging
import re
import xml.etree.ElementTree as ET # type: ignore
from . import archive, office
class EPUBParser(archive.ArchiveBasedAbstractParser):
    """ Parser for epub files: zip archives whose package-level
    metadata lives in the OPF document (OEBPS/content.opf). """
    mimetypes = {'application/epub+zip', }

    def __init__(self, filename):
        super().__init__(filename)
        # These members are mandatory for a valid epub and must
        # survive the cleaning pass untouched.
        required_members = {
            'META-INF/container.xml',
            'mimetype',
            'OEBPS/content.opf',
        }
        self.files_to_keep = {re.compile(p) for p in required_members}  # type: ignore

    def _specific_get_meta(self, full_path, file_path):
        """ Harvest metadata from the OPF package document;
        every other member is handled generically. """
        if file_path != 'OEBPS/content.opf':
            return {}
        with open(full_path, encoding='utf-8') as f:
            try:
                # Pull out <meta …>, <dc:…> and <cp:…> elements as
                # tag -> text pairs via a backreferenced regex.
                matches = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)</\1>",
                                     f.read(), re.I | re.M)
            except (TypeError, UnicodeDecodeError):
                # The opf couldn't be read/parsed as text: flag it wholesale.
                return {file_path: 'harmful content', }
            return dict(matches)

    def _specific_cleanup(self, full_path: str):
        """ Drop the whole <metadata> element from the OPF document. """
        if not full_path.endswith('OEBPS/content.opf'):
            return True
        try:
            tree, namespace = office._parse_xml(full_path)
        except ET.ParseError:
            logging.error("Unable to parse %s in %s.", full_path, self.filename)
            return False
        # ElementTree has no parent pointers, so build a child -> parent map.
        parents = {child: parent for parent in tree.iter() for child in parent}
        for element in tree.iterfind('.//', namespace):
            if element.tag.strip().lower().endswith('metadata'):
                parents[element].remove(element)
                break
        tree.write(full_path, xml_declaration=True)
        return True
...@@ -5,7 +5,7 @@ from . import abstract ...@@ -5,7 +5,7 @@ from . import abstract
class HarmlessParser(abstract.AbstractParser): class HarmlessParser(abstract.AbstractParser):
""" This is the parser for filetypes that can not contain metadata. """ """ This is the parser for filetypes that can not contain metadata. """
mimetypes = {'text/plain', 'image/x-ms-bmp'} mimetypes = {'text/plain', 'image/x-ms-bmp', }
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, dict]]:
return dict() return dict()
......
...@@ -10,6 +10,9 @@ assert Tuple # make pyflakes happy ...@@ -10,6 +10,9 @@ assert Tuple # make pyflakes happy
T = TypeVar('T', bound='abstract.AbstractParser') T = TypeVar('T', bound='abstract.AbstractParser')
mimetypes.add_type('application/epub+zip', '.epub')
mimetypes.add_type('application/x-dtbncx+xml', '.ncx')
def __load_all_parsers(): def __load_all_parsers():
""" Loads every parser in a dynamic way """ """ Loads every parser in a dynamic way """
......
from html import parser from html import parser
from typing import Dict, Any, List, Tuple from typing import Dict, Any, List, Tuple
import re
import string
from . import abstract from . import abstract
class CSSParser(abstract.AbstractParser):
    """ Parser for CSS files; the only metadata a stylesheet can carry
    hides inside `/* ... */` comments. """
    mimetypes = {'text/css', }
    flags = re.MULTILINE | re.DOTALL

    def remove_all(self) -> bool:
        """ Write a copy of the stylesheet with every comment stripped. """
        with open(self.filename, encoding='utf-8') as f:
            cleaned = re.sub(r'/\*.+?\*/', '', f.read(), 0, self.flags)
        with open(self.output_filename, 'w', encoding='utf-8') as f:
            f.write(cleaned)
        return True

    def get_meta(self) -> Dict[str, Any]:
        """ Collect `key: value` pairs found in comments; lines that
        don't look like a pair are reported under 'harmful data'. """
        metadata = {}
        with open(self.filename, encoding='utf-8') as f:
            cssdoc = re.findall(r'/\*(.+?)\*/', f.read(), self.flags)
            for match in cssdoc:
                for line in match.splitlines():
                    try:
                        # Split on the first colon only, so values that
                        # themselves contain colons (urls, timestamps, …)
                        # are parsed instead of being flagged as harmful.
                        k, v = line.split(':', 1)
                        metadata[k.strip(string.whitespace + '*')] = v.strip()
                    except ValueError:
                        metadata['harmful data'] = line.strip()
        return metadata
class HTMLParser(abstract.AbstractParser): class HTMLParser(abstract.AbstractParser):
mimetypes = {'text/html', } mimetypes = {'text/html', 'application/x-dtbncx+xml', }
def __init__(self, filename): def __init__(self, filename):
super().__init__(filename) super().__init__(filename)
self.__parser = _HTMLParser() self.__parser = _HTMLParser(self.filename)
with open(filename) as f: with open(filename, encoding='utf-8') as f:
self.__parser.feed(f.read()) self.__parser.feed(f.read())
self.__parser.close() self.__parser.close()
...@@ -25,45 +52,69 @@ class _HTMLParser(parser.HTMLParser): ...@@ -25,45 +52,69 @@ class _HTMLParser(parser.HTMLParser):
we're using an internal queue to track all the opening/closing tags, we're using an internal queue to track all the opening/closing tags,
and hoping for the best. and hoping for the best.
""" """
def __init__(self): tag_blacklist = {'doctitle', 'meta'} # everything is lowercase
def __init__(self, filename):
super().__init__() super().__init__()
self.filename = filename
self.__textrepr = '' self.__textrepr = ''
self.__meta = {} self.__meta = {}
self.__validation_queue = [] self.__validation_queue = []
# We're using a counter instead of a boolean to handle nested tags
self.__in_dangerous_tag = 0
def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]): def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
self.__textrepr += self.get_starttag_text()
self.__validation_queue.append(tag) self.__validation_queue.append(tag)
if tag in self.tag_blacklist:
self.__in_dangerous_tag += 1
return
if self.__in_dangerous_tag == 0:
self.__textrepr += self.get_starttag_text()
def handle_endtag(self, tag: str): def handle_endtag(self, tag: str):
if not self.__validation_queue: if not self.__validation_queue:
raise ValueError raise ValueError("The closing tag %s doesn't have a corresponding "
elif tag != self.__validation_queue.pop(): "opening one in %s." % (tag, self.filename))
raise ValueError
previous_tag = self.__validation_queue.pop()
if tag != previous_tag:
raise ValueError("The closing tag %s doesn't match the previous "
"tag %s in %s" %
(tag, previous_tag, self.filename))
elif tag in self.tag_blacklist:
self.__in_dangerous_tag -= 1
return
if self.__in_dangerous_tag == 0:
# There is no `get_endtag_text()` method :/ # There is no `get_endtag_text()` method :/
self.__textrepr += '</' + tag + '>\n' self.__textrepr += '</' + tag + '>\n'
def handle_data(self, data: str): def handle_data(self, data: str):
if data.strip(): if self.__in_dangerous_tag == 0 and data.strip():
self.__textrepr += data self.__textrepr += data
def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]): def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
if tag == 'meta': if tag in self.tag_blacklist:
meta = {k:v for k, v in attrs} meta = {k:v for k, v in attrs}
name = meta.get('name', 'harmful metadata') name = meta.get('name', 'harmful metadata')
content = meta.get('content', 'harmful data') content = meta.get('content', 'harmful data')
self.__meta[name] = content self.__meta[name] = content
else: else:
if self.__in_dangerous_tag == 0:
self.__textrepr += self.get_starttag_text() self.__textrepr += self.get_starttag_text()
def remove_all(self, output_filename: str) -> bool: def remove_all(self, output_filename: str) -> bool:
if self.__validation_queue: if self.__validation_queue:
raise ValueError raise ValueError("Some tags (%s) were left unclosed in %s" % (
with open(output_filename, 'w') as f: ', '.join(self.__validation_queue),
self.filename))
with open(output_filename, 'w', encoding='utf-8') as f:
f.write(self.__textrepr) f.write(self.__textrepr)
return True return True
def get_meta(self) -> Dict[str, Any]: def get_meta(self) -> Dict[str, Any]:
if self.__validation_queue: if self.__validation_queue:
raise ValueError raise ValueError("Some tags (%s) were left unclosed in %s" % (
', '.join(self.__validation_queue),
self.filename))
return self.__meta return self.__meta
/**
* This is my super css framework
* version: 1.0
* author : jvoisin
*/
body {
color: red;
background-color: blue;
}
.underline {
text-decoration: underline; /* underline is cool */
}
File added
File added
...@@ -7,7 +7,7 @@ import logging ...@@ -7,7 +7,7 @@ import logging
import zipfile import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent from libmat2 import pdf, images, audio, office, parser_factory, torrent
from libmat2 import harmless, video, html from libmat2 import harmless, video, web
# No need to logging messages, should something go wrong, # No need to logging messages, should something go wrong,
# the testsuite _will_ fail. # the testsuite _will_ fail.
...@@ -220,34 +220,34 @@ class TestCorruptedFiles(unittest.TestCase): ...@@ -220,34 +220,34 @@ class TestCorruptedFiles(unittest.TestCase):
os.remove('./tests/data/--output.avi') os.remove('./tests/data/--output.avi')
def test_zip(self): def test_zip(self):
with zipfile.ZipFile('./tests/data/dirty.zip', 'w') as zout: with zipfile.ZipFile('./tests/data/clean.zip', 'w') as zout:
zout.write('./tests/data/dirty.flac') zout.write('./tests/data/dirty.flac')
zout.write('./tests/data/dirty.docx') zout.write('./tests/data/dirty.docx')
zout.write('./tests/data/dirty.jpg') zout.write('./tests/data/dirty.jpg')
zout.write('./tests/data/embedded_corrupted.docx') zout.write('./tests/data/embedded_corrupted.docx')
p, mimetype = parser_factory.get_parser('./tests/data/dirty.zip') p, mimetype = parser_factory.get_parser('./tests/data/clean.zip')
self.assertEqual(mimetype, 'application/zip') self.assertEqual(mimetype, 'application/zip')
meta = p.get_meta() meta = p.get_meta()
self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !') self.assertEqual(meta['tests/data/dirty.flac']['comments'], 'Thank you for using MAT !')
self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!') self.assertEqual(meta['tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')
self.assertFalse(p.remove_all()) self.assertFalse(p.remove_all())
os.remove('./tests/data/dirty.zip') os.remove('./tests/data/clean.zip')
def test_html(self): def test_html(self):
shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
with open('./tests/data/clean.html', 'a') as f: with open('./tests/data/clean.html', 'a') as f:
f.write('<open>but not</closed>') f.write('<open>but not</closed>')
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
html.HTMLParser('./tests/data/clean.html') web.HTMLParser('./tests/data/clean.html')
os.remove('./tests/data/clean.html') os.remove('./tests/data/clean.html')
# Yes, we're able to deal with malformed html :/ # Yes, we're able to deal with malformed html :/
shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
with open('./tests/data/clean.html', 'a') as f: with open('./tests/data/clean.html', 'a') as f:
f.write('<meta name=\'this" is="weird"/>') f.write('<meta name=\'this" is="weird"/>')
p = html.HTMLParser('./tests/data/clean.html') p = web.HTMLParser('./tests/data/clean.html')
self.assertTrue(p.remove_all()) self.assertTrue(p.remove_all())
p = html.HTMLParser('./tests/data/clean.cleaned.html') p = web.HTMLParser('./tests/data/clean.cleaned.html')
self.assertEqual(p.get_meta(), {}) self.assertEqual(p.get_meta(), {})
os.remove('./tests/data/clean.html') os.remove('./tests/data/clean.html')
os.remove('./tests/data/clean.cleaned.html') os.remove('./tests/data/clean.cleaned.html')
...@@ -255,17 +255,38 @@ class TestCorruptedFiles(unittest.TestCase): ...@@ -255,17 +255,38 @@ class TestCorruptedFiles(unittest.TestCase):
with open('./tests/data/clean.html', 'w') as f: with open('./tests/data/clean.html', 'w') as f:
f.write('</close>') f.write('</close>')
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
html.HTMLParser('./tests/data/clean.html') web.HTMLParser('./tests/data/clean.html')
os.remove('./tests/data/clean.html') os.remove('./tests/data/clean.html')
with open('./tests/data/clean.html', 'w') as f: with open('./tests/data/clean.html', 'w') as f:
f.write('<notclosed>') f.write('<notclosed>')
p = html.HTMLParser('./tests/data/clean.html') p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
p.get_meta() p.get_meta()
p = html.HTMLParser('./tests/data/clean.html') p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
p.remove_all() p.remove_all()
os.remove('./tests/data/clean.html') os.remove('./tests/data/clean.html')
with open('./tests/data/clean.html', 'w') as f:
f.write('<doctitle><br/></doctitle><br/><notclosed>')
p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError):
p.get_meta()
p = web.HTMLParser('./tests/data/clean.html')
with self.assertRaises(ValueError):
p.remove_all()
os.remove('./tests/data/clean.html')
    def test_epub(self):
        """A corrupted epub (a jpeg masquerading as the OPF document)
        must be reported as harmful and refuse to be cleaned."""
        with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout:
            # Deliberately stuff binary image data where xml is expected.
            zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf')
        p, mimetype = parser_factory.get_parser('./tests/data/clean.epub')
        self.assertEqual(mimetype, 'application/epub+zip')
        meta = p.get_meta()
        # The unparsable opf is flagged wholesale as harmful content.
        self.assertEqual(meta['OEBPS/content.opf']['OEBPS/content.opf'],
                         'harmful content')
        # Cleaning must fail rather than silently produce a broken epub.
        self.assertFalse(p.remove_all())
        os.remove('./tests/data/clean.epub')
...@@ -6,7 +6,7 @@ import os ...@@ -6,7 +6,7 @@ import os
import zipfile import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
from libmat2 import check_dependencies, video, archive, html from libmat2 import check_dependencies, video, archive, web, epub
class TestCheckDependencies(unittest.TestCase): class TestCheckDependencies(unittest.TestCase):
...@@ -177,6 +177,23 @@ class TestGetMeta(unittest.TestCase): ...@@ -177,6 +177,23 @@ class TestGetMeta(unittest.TestCase):
meta = p.get_meta() meta = p.get_meta()
self.assertEqual(meta['Comment'], 'this is a test comment') self.assertEqual(meta['Comment'], 'this is a test comment')
    def test_epub(self):
        """Metadata must be surfaced from every layer of an epub:
        the opf, the ncx table of contents, and embedded images/html."""
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.epub')
        self.assertEqual(mimetype, 'application/epub+zip')
        meta = p.get_meta()
        self.assertEqual(meta['OEBPS/content.opf']['dc:creator'], 'Dorothy L. Sayers')
        self.assertEqual(meta['OEBPS/toc.ncx']['dtb:generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
        self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@images@shield25.jpg']['CreatorTool'], 'Adobe Photoshop CS5 Macintosh')
        self.assertEqual(meta['OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@58820-h-2.htm.html']['generator'], 'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>')
    def test_css(self):
        """`key: value` pairs in css comments must be extracted, and
        non-pair comment lines reported under 'harmful data'."""
        p, mimetype = parser_factory.get_parser('./tests/data/dirty.css')
        self.assertEqual(mimetype, 'text/css')
        meta = p.get_meta()
        self.assertEqual(meta['author'], 'jvoisin')
        self.assertEqual(meta['version'], '1.0')
        # 'underline is cool' is a comment line with no colon in it.
        self.assertEqual(meta['harmful data'], 'underline is cool')
class TestRemovingThumbnails(unittest.TestCase): class TestRemovingThumbnails(unittest.TestCase):
def test_odt(self): def test_odt(self):
shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt') shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt')
...@@ -599,7 +616,7 @@ class TestCleaning(unittest.TestCase): ...@@ -599,7 +616,7 @@ class TestCleaning(unittest.TestCase):
def test_html(self): def test_html(self):
shutil.copy('./tests/data/dirty.html', './tests/data/clean.html') shutil.copy('./tests/data/dirty.html', './tests/data/clean.html')
p = html.HTMLParser('./tests/data/clean.html') p = web.HTMLParser('./tests/data/clean.html')
meta = p.get_meta() meta = p.get_meta()
self.assertEqual(meta['author'], 'jvoisin') self.assertEqual(meta['author'], 'jvoisin')
...@@ -607,10 +624,50 @@ class TestCleaning(unittest.TestCase): ...@@ -607,10 +624,50 @@ class TestCleaning(unittest.TestCase):
ret = p.remove_all() ret = p.remove_all()
self.assertTrue(ret) self.assertTrue(ret)
p = html.HTMLParser('./tests/data/clean.cleaned.html') p = web.HTMLParser('./tests/data/clean.cleaned.html')
self.assertEqual(p.get_meta(), {}) self.assertEqual(p.get_meta(), {})
self.assertTrue(p.remove_all()) self.assertTrue(p.remove_all())
os.remove('./tests/data/clean.html') os.remove('./tests/data/clean.html')
os.remove('./tests/data/clean.cleaned.html') os.remove('./tests/data/clean.cleaned.html')
os.remove('./tests/data/clean.cleaned.cleaned.html') os.remove('./tests/data/clean.cleaned.cleaned.html')
    def test_epub(self):
        """Round-trip: clean a dirty epub, then verify the cleaned copy
        has no metadata left and can itself be cleaned again."""
        shutil.copy('./tests/data/dirty.epub', './tests/data/clean.epub')
        p = epub.EPUBParser('./tests/data/clean.epub')
        meta = p.get_meta()
        self.assertEqual(meta['OEBPS/content.opf']['dc:source'], 'http://www.gutenberg.org/files/58820/58820-h/58820-h.htm')
        ret = p.remove_all()
        self.assertTrue(ret)
        # The cleaned file must come back metadata-free.
        p = epub.EPUBParser('./tests/data/clean.cleaned.epub')
        self.assertEqual(p.get_meta(), {})
        # Cleaning an already-clean file must still succeed (idempotence).
        self.assertTrue(p.remove_all())
        os.remove('./tests/data/clean.epub')
        os.remove('./tests/data/clean.cleaned.epub')
        os.remove('./tests/data/clean.cleaned.cleaned.epub')
    def test_css(self):
        """Round-trip: clean a dirty stylesheet, then verify the cleaned
        copy has no metadata left and can itself be cleaned again."""
        shutil.copy('./tests/data/dirty.css', './tests/data/clean.css')
        p = web.CSSParser('./tests/data/clean.css')
        self.assertEqual(p.get_meta(), {
            'harmful data': 'underline is cool',
            'version': '1.0',
            'author': 'jvoisin'})
        ret = p.remove_all()
        self.assertTrue(ret)
        # All comments (and thus all metadata) must be gone.
        p = web.CSSParser('./tests/data/clean.cleaned.css')
        self.assertEqual(p.get_meta(), {})
        # Cleaning an already-clean file must still succeed (idempotence).
        self.assertTrue(p.remove_all())
        os.remove('./tests/data/clean.css')
        os.remove('./tests/data/clean.cleaned.css')
        os.remove('./tests/data/clean.cleaned.cleaned.css')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment