epub.py 2.98 KB
Newer Older
jvoisin's avatar
jvoisin committed
1 2
import logging
import re
jvoisin's avatar
jvoisin committed
3
import uuid
jvoisin's avatar
jvoisin committed
4 5 6 7
import xml.etree.ElementTree as ET  # type: ignore

from . import archive, office

jvoisin's avatar
jvoisin committed
8
class EPUBParser(archive.ZipParser):
    """Parser for EPUB archives, built on top of `archive.ZipParser`.

    Metadata is read from, and scrubbed in, the package document
    (`content.opf`); the `<head>` of `OEBPS/toc.ncx` is reset as well.
    """

    mimetypes = {'application/epub+zip', }
    # Clark-notation prefix of the Dublin Core namespace, used to build the
    # tags of the placeholder metadata elements.
    metadata_namespace = '{http://purl.org/dc/elements/1.1/}'

    def __init__(self, filename):
        """Set up the parser for `filename`.

        Besides delegating to the parent constructor, this registers the
        compiled patterns stored in `files_to_keep` and draws the random
        identifier later written into the cleaned package document.
        """
        super().__init__(filename)
        # NOTE(review): how `files_to_keep` is consumed is decided by the
        # parent class; presumably matching members are kept untouched.
        # The entries are compiled as regular expressions, so '.' is a
        # wildcard here, not a literal dot.
        self.files_to_keep = set(map(re.compile, {  # type: ignore
            'META-INF/container.xml',
            'mimetype',
            'OEBPS/content.opf',
            'content.opf',
            }))
        self.uniqid = uuid.uuid4()

    def _specific_get_meta(self, full_path, file_path):
        """Extract metadata from a `content.opf` file.

        :param full_path: path of the file to read on disk.
        :param file_path: name of the file; only names ending with
            `content.opf` are inspected.
        :return: an empty dict for any other file; otherwise a dict mapping
            each matched `meta`/`dc`/`cp` tag name to its text content, or
            `{file_path: 'harmful content'}` if the file cannot be decoded
            as UTF-8.
        """
        if not file_path.endswith('content.opf'):
            return {}

        with open(full_path, encoding='utf-8') as f:
            try:
                results = re.findall(r"<((?:meta|dc|cp).+?)[^>]*>(.+)</\1>",
                                     f.read(), re.I | re.M)
                # Later occurrences of the same tag override earlier ones.
                return dict(results)
            except (TypeError, UnicodeDecodeError):
                return {file_path: 'harmful content', }

    def _specific_cleanup(self, full_path: str):
        """Scrub the EPUB-specific metadata carriers.

        :param full_path: path of the extracted file to clean.
        :return: the result of the dedicated handler for `content.opf` and
            `OEBPS/toc.ncx`; `True` for every other file, which is left
            untouched by this method.
        """
        if full_path.endswith('content.opf'):
            return self.__handle_contentopf(full_path)
        elif full_path.endswith('OEBPS/toc.ncx'):
            return self.__handle_tocncx(full_path)
        return True

    def __handle_tocncx(self, full_path: str) -> bool:
        """Reset the `head` element of a `toc.ncx` file, in place.

        The first element whose tag ends with `head` (case-insensitive) is
        cleared and given a single `meta` child with empty `name` and
        `content` attributes.

        :return: `False` if the file cannot be parsed, `True` otherwise.
        """
        try:
            tree, namespace = office._parse_xml(full_path)
        except ET.ParseError:  # pragma: nocover
            logging.error("Unable to parse %s in %s.", full_path, self.filename)
            return False

        for item in tree.iterfind('.//', namespace):  # pragma: nocover
            if item.tag.strip().lower().endswith('head'):
                # clear() drops children, attributes, text and tail.
                item.clear()
                ET.SubElement(item, 'meta', attrib={'name': '', 'content': ''})
                break
        tree.write(full_path, xml_declaration=True, encoding='utf-8',
                   short_empty_elements=False)
        return True

    def __handle_contentopf(self, full_path: str) -> bool:
        """Replace the `metadata` block of a `content.opf` file, in place.

        The first element whose tag ends with `metadata` (case-insensitive)
        is cleared, then repopulated with a Dublin Core `identifier`
        carrying this parser's random UUID, followed by empty `language`
        and `title` elements.

        :return: `False` if the file cannot be parsed, `True` otherwise.
        """
        try:
            tree, namespace = office._parse_xml(full_path)
        except ET.ParseError:
            logging.error("Unable to parse %s in %s.", full_path, self.filename)
            return False

        for item in tree.iterfind('.//', namespace):  # pragma: nocover
            if item.tag.strip().lower().endswith('metadata'):
                item.clear()

                # item with mandatory content
                uniqid = ET.Element(self.metadata_namespace + 'identifier')
                uniqid.text = str(self.uniqid)
                uniqid.set('id', 'id')
                item.append(uniqid)

                # items without mandatory content; an ordered tuple (not a
                # set) keeps the written file identical from run to run.
                for name in ('language', 'title'):
                    item.append(ET.Element(self.metadata_namespace + name))
                break  # there is only a single <metadata> block
        tree.write(full_path, xml_declaration=True, encoding='utf-8')
        return True