Skip to content
Snippets Groups Projects
web.py 6.67 KiB
Newer Older
  • Learn to ignore specific revisions
  • from html import parser, escape
    
    Julien (jvoisin) Voisin's avatar
    Julien (jvoisin) Voisin committed
    from typing import Dict, Any, List, Tuple, Set, Optional
    
    import re
    import string
    
    from . import abstract
    
    
    # `Set` is otherwise only referenced from `# type:` comments in this file;
    # presumably this bare reference keeps linters from reporting the import
    # as unused -- TODO confirm before removing.
    assert Set
    
    # pylint: disable=too-many-instance-attributes
    
    
    class CSSParser(abstract.AbstractParser):
        """Parser for CSS files.

        There is no such thing as metadata in CSS files, only comments of the
        form `/* … */`, so we're removing the latter.
        """
        mimetypes = {'text/css', }
        # DOTALL lets `.` span newlines, so multi-line comments are matched.
        flags = re.MULTILINE | re.DOTALL

        def remove_all(self) -> bool:
            """Write a copy of the file, stripped of every comment.

            The content of `self.filename` is read as UTF-8, every `/* … */`
            comment is removed, and the result is written to
            `self.output_filename`.

            Returns:
                Always True.
            """
            with open(self.filename, encoding='utf-8') as f:
                # `count` and `flags` are passed by keyword: passing them
                # positionally is deprecated since Python 3.13.
                cleaned = re.sub(r'/\*.*?\*/', '', f.read(),
                                 count=0, flags=self.flags)

            with open(self.output_filename, 'w', encoding='utf-8') as f:
                f.write(cleaned)
            return True

        def get_meta(self) -> Dict[str, Any]:
            """Return what is found in the comments of the file.

            Every line of every comment is inspected: a `key: value` line is
            stored under `key` (stripped of whitespace and of `*`), and a line
            without any colon is stored under the `harmful data` key, so only
            the last such line is kept.

            Returns:
                A dict mapping metadata names to their values.
            """
            metadata = {}
            with open(self.filename, encoding='utf-8') as f:
                cssdoc = re.findall(r'/\*(.*?)\*/', f.read(), self.flags)

            for match in cssdoc:
                for line in match.splitlines():
                    # Split on the first colon only, so that values which
                    # contain colons themselves (urls, timestamps, …) are
                    # still reported under their own key.
                    key, separator, value = line.partition(':')
                    if separator:
                        metadata[key.strip(string.whitespace + '*')] = value.strip()
                    else:
                        metadata['harmful data'] = line.strip()
            return metadata
    
    
    
    class AbstractHTMLParser(abstract.AbstractParser):
        """Common base of the html/xml-based parsers.

        The actual work is delegated to a `_HTMLParser` instance, which is fed
        the whole file when the object is created. Subclasses tune the
        behaviour via `tags_blocklist` and `tags_required_blocklist`.
        """

        tags_blocklist = set()  # type: Set[str]

        # In some html/xml-based formats some tags are mandatory,
        # so we're keeping them, but are discarding their content
        tags_required_blocklist = set()  # type: Set[str]

        def __init__(self, filename):
            """Parse `filename` (read as UTF-8) in one go."""
            super().__init__(filename)

            html_parser = _HTMLParser(
                self.filename,
                self.tags_blocklist,
                self.tags_required_blocklist,
            )
            self.__parser = html_parser

            with open(filename, encoding='utf-8') as handle:
                content = handle.read()
                html_parser.feed(content)
            html_parser.close()

        def get_meta(self) -> Dict[str, Any]:
            """Return the metadata collected by the underlying parser."""
            collected = self.__parser.get_meta()
            return collected

        def remove_all(self) -> bool:
            """Have the underlying parser write to `self.output_filename`."""
            destination = self.output_filename
            return self.__parser.remove_all(destination)
    
    
    
    class HTMLParser(AbstractHTMLParser):
        """Parser for HTML and XHTML documents.

        Declares `meta` as a blocklisted tag, and `title` as a required tag
        whose content is discarded.
        """
        mimetypes = {
            'text/html',
            'application/xhtml+xml',
        }
        tags_blocklist = {'meta'}
        tags_required_blocklist = {'title'}
    
    
    
    class DTBNCXParser(AbstractHTMLParser):
        """Parser for `application/x-dtbncx+xml` documents.

        Declares `title`, `doctitle` and `meta` as required tags whose content
        is discarded.
        """
        mimetypes = {'application/x-dtbncx+xml'}
        tags_required_blocklist = {
            'title',
            'doctitle',
            'meta',
        }
    
    class _HTMLParser(parser.HTMLParser):
        """Python doesn't have a validating html parser in its stdlib, so
        we're using an internal queue to track all the opening/closing tags,
        and hoping for the best.

        Moreover, the parser.HTMLParser class doesn't provide a get_endtag_text
        method, so we have to use get_starttag_text instead, put its result in a
        LIFO, and transform it in a closing tag when needed.

        Also, gotcha: the `tag` parameters are always in lowercase.
        """

        def __init__(self, filename, blocklisted_tags, required_blocklisted_tags):
            """Set up an empty parser.

            Args:
                filename: name of the parsed file, only used in error messages.
                blocklisted_tags: set of tags removed along with their content.
                required_blocklisted_tags: set of tags that are kept, but
                    whose content is discarded.

            Raises:
                ValueError: if the two sets of tags overlap.
            """
            super().__init__()
            self.filename = filename
            self.__textrepr = ''
            self.__meta = {}  # type: Dict[str, Any]
            self.__validation_queue = []  # type: List[str]

            # We're using counters instead of booleans, to handle nested tags
            self.__in_dangerous_but_required_tag = 0
            self.__in_dangerous_tag = 0

            if required_blocklisted_tags & blocklisted_tags:  # pragma: nocover
                raise ValueError("There is an overlap between %s and %s" % (
                    required_blocklisted_tags, blocklisted_tags))
            self.tag_required_blocklist = required_blocklisted_tags
            self.tag_blocklist = blocklisted_tags

        def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
            """Record the opening tag, and keep it if we're in a safe context."""
            # Ignore the type, because mypy is too stupid to infer
            # that get_starttag_text() can't return None.
            original_tag = self.get_starttag_text()  # type: ignore
            self.__validation_queue.append(original_tag)  # type: ignore

            if tag in self.tag_blocklist:
                self.__in_dangerous_tag += 1

            if self.__in_dangerous_tag == 0:
                if self.__in_dangerous_but_required_tag == 0:
                    self.__textrepr += original_tag

            # Incremented after the output step, so that the required tag
            # itself is kept, while everything nested in it is dropped.
            if tag in self.tag_required_blocklist:
                self.__in_dangerous_but_required_tag += 1

        def handle_endtag(self, tag: str):
            """Check that the closing tag matches the last opened one, and
            keep it if we're in a safe context.

            Raises:
                ValueError: if there is no opening tag left to match, or if
                    the last opened tag isn't the one being closed.
            """
            if not self.__validation_queue:
                raise ValueError("The closing tag %s doesn't have a corresponding "
                                 "opening one in %s." % (tag, self.filename))

            previous_tag = self.__validation_queue.pop()
            previous_tag = previous_tag[1:-1]  # remove < and >
            # Remove the attributes: they might be separated from the name of
            # the tag by any kind of whitespace (newline, tab, …), not only
            # by a space.
            previous_tag = previous_tag.split(None, 1)[0]
            if tag != previous_tag.lower():
                raise ValueError("The closing tag %s doesn't match the previous "
                                 "tag %s in %s" %
                                 (tag, previous_tag, self.filename))

            # Decremented before the output step, so that the closing tag of
            # a required tag is kept.
            if tag in self.tag_required_blocklist:
                self.__in_dangerous_but_required_tag -= 1

            if self.__in_dangerous_tag == 0:
                if self.__in_dangerous_but_required_tag == 0:
                    # There is no `get_endtag_text()` method :/
                    self.__textrepr += '</' + previous_tag + '>'

            # Decremented after the output step, so that the closing tag of
            # a blocklisted tag is dropped.
            if tag in self.tag_blocklist:
                self.__in_dangerous_tag -= 1

        def handle_data(self, data: str):
            """Keep (escaped) non-blank text, if we're in a safe context."""
            if self.__in_dangerous_but_required_tag == 0:
                if self.__in_dangerous_tag == 0:
                    if data.strip():
                        self.__textrepr += escape(data)

        def handle_startendtag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
            """Handle self-closing tags, like `<meta … />`.

            The `name`/`content` attributes of (required) blocklisted tags are
            recorded as metadata; such a tag is then either dropped, or
            replaced by an attribute-less one if it is required. Other tags
            are kept verbatim if we're in a safe context.
            """
            if tag in self.tag_required_blocklist | self.tag_blocklist:
                meta = dict(attrs)
                name = meta.get('name', 'harmful metadata')
                content = meta.get('content', 'harmful data')
                self.__meta[name] = content

                if self.__in_dangerous_tag == 0:
                    if tag in self.tag_required_blocklist:
                        self.__textrepr += '<' + tag + ' />'
                # Don't fall through: the original tag, along with all its
                # attributes, would otherwise be written to the output.
                return

            if self.__in_dangerous_tag == 0:
                if self.__in_dangerous_but_required_tag == 0:
                    self.__textrepr += self.get_starttag_text()  # type: ignore

        def __ensure_every_tag_is_closed(self) -> None:
            """Raise a ValueError if some opened tags were never closed."""
            if self.__validation_queue:
                raise ValueError("Some tags (%s) were left unclosed in %s" % (
                    ', '.join(self.__validation_queue),
                    self.filename))

        def remove_all(self, output_filename: str) -> bool:
            """Write the cleaned document, as UTF-8, to `output_filename`.

            Returns:
                Always True.

            Raises:
                ValueError: if some tags were left unclosed.
            """
            self.__ensure_every_tag_is_closed()
            with open(output_filename, 'w', encoding='utf-8') as f:
                f.write(self.__textrepr)
            return True

        def get_meta(self) -> Dict[str, Any]:
            """Return the metadata found in the (required) blocklisted
            self-closing tags.

            Raises:
                ValueError: if some tags were left unclosed.
            """
            self.__ensure_every_tag_is_closed()
            return self.__meta