Commit 1cdf3d80 authored by jvoisin

Add a preliminary implementation of epub support

parent 6e63e03b
Pipeline #22560 failed with stages
in 4 minutes and 9 seconds
......@@ -4,6 +4,7 @@ stages:
- linting
- test
bandit:
stage: linting
script: # TODO: remove B405 and B314
......@@ -74,3 +75,4 @@ tests:archlinux:
script:
- pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap ffmpeg
- python3 setup.py test
import re
import xml.etree.ElementTree as ET # type: ignore
from . import archive, office
class EPUBParser(archive.ArchiveBasedAbstractParser):
    """ Preliminary parser for EPUB ebooks.

    The EPUB-specific work is limited to the `OEBPS/content.opf` member:
    its metadata is reported by `_specific_get_meta` and stripped by
    `_specific_cleanup`. Any other member yields no EPUB-specific metadata
    and is reported as successfully cleaned by these two hooks.
    """
    mimetypes = {'application/epub+zip', }

    def __init__(self, filename):
        """ Set up the parser for `filename` and register the members to keep.

        NOTE(review): how `files_to_keep` is consumed is decided by the
        archive base class, which is not visible here -- presumably the
        members matching these patterns are kept in the output archive.
        """
        super().__init__(filename)
        kept_members = {
            'META-INF/container.xml',
            'mimetype',
            'OEBPS/content.opf',
        }
        # The names are compiled as regular expressions, not literal strings.
        self.files_to_keep = {re.compile(member) for member in kept_members}  # type: ignore

    def _specific_get_meta(self, full_path, file_path):
        """ Return the metadata found in the `OEBPS/content.opf` member.

        :param full_path: path of the file to read from disk
        :param file_path: name of the member, compared to `OEBPS/content.opf`
        :return: an empty dict for any other member; otherwise a dict mapping
            each matched tag name to its text, or a single
            `{file_path: 'harmful content'}` entry if the file can't be read
            as UTF-8 text.
        """
        if file_path != 'OEBPS/content.opf':
            return {}

        with open(full_path, encoding='utf-8') as opf:
            try:
                # Group 1 is a tag name starting with meta, dc or cp; group 2
                # is the text between that tag and its matching closing tag.
                found = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)</\1>",
                                   opf.read(), re.I|re.M)
            except (TypeError, UnicodeDecodeError):
                # The content couldn't be processed, so flag the whole member.
                return {file_path: 'harmful content', }
            return dict(found)

    def _specific_cleanup(self, full_path: str):
        """ Remove the metadata element from the `OEBPS/content.opf` file.

        :param full_path: path of the file to clean, rewritten in place
        :return: True if the file isn't `OEBPS/content.opf` or was rewritten,
            False if it couldn't be parsed as XML.
        """
        if not full_path.endswith('OEBPS/content.opf'):
            return True

        try:
            tree, namespace = office._parse_xml(full_path)
        except ET.ParseError:
            return False

        # ElementTree elements don't know their parent, so build a
        # child -> parent lookup to be able to detach an element.
        parent_of = {}
        for parent in tree.iter():
            for child in parent:
                parent_of[child] = parent

        # Only the first element whose tag ends with `metadata` is removed.
        for element in tree.iterfind('.//', namespace):
            if not element.tag.strip().lower().endswith('metadata'):
                continue
            parent_of[element].remove(element)
            break

        tree.write(full_path, xml_declaration=True)
        return True
......@@ -5,7 +5,7 @@ from . import abstract
class HarmlessParser(abstract.AbstractParser):
""" This is the parser for filetypes that can not contain metadata. """
mimetypes = {'text/plain', 'image/x-ms-bmp'}
mimetypes = {'text/plain', 'image/x-ms-bmp', 'text/css'}
def get_meta(self) -> Dict[str, Union[str, dict]]:
    """ Harmless filetypes can't contain metadata: always return an empty dict. """
    return {}
......
......@@ -5,7 +5,7 @@ from . import abstract
class HTMLParser(abstract.AbstractParser):
mimetypes = {'text/html', }
mimetypes = {'text/html', 'application/x-dtbncx+xml', }
def __init__(self, filename):
    """ Initialise the parent parser for `filename` and create the
    private `_HTMLParser` instance held by this object. """
    super().__init__(filename)
    markup_parser = _HTMLParser()
    self.__parser = markup_parser
......@@ -25,36 +25,49 @@ class _HTMLParser(parser.HTMLParser):
we're using an internal queue to track all the opening/closing tags,
and hoping for the best.
"""
tag_blacklist = {'doctitle', 'meta'} # everything is lowercase
def __init__(self):
    """ Initialise the underlying parser and this object's bookkeeping. """
    super().__init__()
    # Stack of the currently opened tags, used to check that they are
    # closed in the right order.
    self.__validation_queue = []
    # A counter rather than a flag, so that nested blacklisted tags work.
    self.__in_dangerous_tag = 0
    # Metadata collected from blacklisted self-closing tags.
    self.__meta = {}
    # Markup accumulated so far by the handlers.
    self.__textrepr = ''
def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]):
    """ Record an opening tag, and keep its text only if it is harmless.

    :param tag: lowercase name of the tag being opened
    :param attrs: the tag's attributes; unused, since the raw text of the
        tag is obtained from `get_starttag_text()` instead
    """
    # Every opening tag is tracked, blacklisted or not, so that
    # `handle_endtag` can check that tags are properly balanced.
    self.__validation_queue.append(tag)
    if tag in self.tag_blacklist:
        self.__in_dangerous_tag += 1
    # Nothing is emitted for a blacklisted tag, nor for anything nested
    # inside one.
    if self.__in_dangerous_tag == 0:
        self.__textrepr += self.get_starttag_text()
def handle_endtag(self, tag: str):
    """ Check that a closing tag is balanced, and keep it only if harmless.

    :param tag: lowercase name of the tag being closed
    :raises ValueError: if no tag is currently open, if `tag` isn't the
        most recently opened one, or if a blacklisted tag is closed while
        none is recorded as being open
    """
    if not self.__validation_queue:
        raise ValueError
    elif tag != self.__validation_queue.pop():
        raise ValueError
    elif tag in self.tag_blacklist:
        if self.__in_dangerous_tag == 0:
            raise ValueError
        # Leaving one level of blacklisted content; nothing is emitted.
        self.__in_dangerous_tag -= 1
    elif self.__in_dangerous_tag == 0:
        # There is no `get_endtag_text()` method :/
        self.__textrepr += '</' + tag + '>\n'
def handle_data(self, data: str):
    """ Keep a piece of text, unless it is blank or inside a blacklisted tag.

    :param data: the text found between two tags
    """
    # Whitespace-only data is dropped; anything else is kept verbatim.
    if self.__in_dangerous_tag == 0 and data.strip():
        self.__textrepr += data
def handle_startendtag(self, tag: str, attrs: List[Tuple[str, str]]):
    """ Handle a self-closing tag (like `<meta ... />`).

    A blacklisted tag is never emitted: its `name` and `content`
    attributes are recorded as metadata instead. Any other tag is emitted
    as-is, unless it is located inside a blacklisted tag.

    :param tag: lowercase name of the tag
    :param attrs: the tag's attributes, as (name, value) pairs
    """
    if tag in self.tag_blacklist:
        attributes = dict(attrs)
        # The placeholders make sure that a blacklisted tag without the
        # usual attributes is still reported.
        name = attributes.get('name', 'harmful metadata')
        content = attributes.get('content', 'harmful data')
        self.__meta[name] = content
    elif self.__in_dangerous_tag == 0:
        self.__textrepr += self.get_starttag_text()
def remove_all(self, output_filename: str) -> bool:
if self.__validation_queue:
......
......@@ -10,6 +10,9 @@ assert Tuple # make pyflakes happy
T = TypeVar('T', bound='abstract.AbstractParser')
mimetypes.add_type('application/epub+zip', '.epub')
mimetypes.add_type('application/x-dtbncx+xml', '.ncx')
def __load_all_parsers():
""" Loads every parser in a dynamic way """
......
......@@ -6,7 +6,7 @@ import os
import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
from libmat2 import check_dependencies, video, archive, html
from libmat2 import check_dependencies, video, archive, html, epub
class TestCheckDependencies(unittest.TestCase):
......@@ -177,6 +177,16 @@ class TestGetMeta(unittest.TestCase):
meta = p.get_meta()
self.assertEqual(meta['Comment'], 'this is a test comment')
def test_epub(self):
    """ The metadata of several members of the dirty EPUB must be reported. """
    parser, detected_mimetype = parser_factory.get_parser('./tests/data/dirty.epub')
    self.assertEqual(detected_mimetype, 'application/epub+zip')

    metadata = parser.get_meta()
    # (archive member, metadata key, expected value), checked in this order.
    expectations = (
        ('OEBPS/content.opf', 'dc:creator', 'Dorothy L. Sayers'),
        ('OEBPS/toc.ncx', 'dtb:generator',
         'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>'),
        ('OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@images@shield25.jpg',
         'CreatorTool', 'Adobe Photoshop CS5 Macintosh'),
        ('OEBPS/@public@vhost@g@gutenberg@html@files@58820@58820-h@58820-h-2.htm.html',
         'generator',
         'Ebookmaker 0.4.0a5 by Marcello Perathoner <webmaster@gutenberg.org>'),
    )
    for member, key, expected in expectations:
        self.assertEqual(metadata[member][key], expected)
class TestRemovingThumbnails(unittest.TestCase):
def test_odt(self):
shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt')
......@@ -614,3 +624,22 @@ class TestCleaning(unittest.TestCase):
os.remove('./tests/data/clean.html')
os.remove('./tests/data/clean.cleaned.html')
os.remove('./tests/data/clean.cleaned.cleaned.html')
def test_epub(self):
    """ Cleaning an EPUB must succeed and leave no metadata behind. """
    working_copy = './tests/data/clean.epub'
    cleaned_copy = './tests/data/clean.cleaned.epub'
    shutil.copy('./tests/data/dirty.epub', working_copy)

    # The working copy is dirty to begin with, and can be cleaned.
    parser = epub.EPUBParser(working_copy)
    source = parser.get_meta()['OEBPS/content.opf']['dc:source']
    self.assertEqual(source, 'http://www.gutenberg.org/files/58820/58820-h/58820-h.htm')
    self.assertTrue(parser.remove_all())

    # The cleaned file has no metadata left, and can be cleaned once more.
    parser = epub.EPUBParser(cleaned_copy)
    self.assertEqual(parser.get_meta(), {})
    self.assertTrue(parser.remove_all())

    leftovers = (
        working_copy,
        cleaned_copy,
        './tests/data/clean.cleaned.cleaned.epub',
    )
    for leftover in leftovers:
        os.remove(leftover)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment