Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
  • fix_heic
  • master
  • 0.1.0
  • 0.1.1
  • 0.1.2
  • 0.1.3
  • 0.10.0
  • 0.10.1
  • 0.11.0
  • 0.12.0
  • 0.12.1
  • 0.12.2
  • 0.12.3
  • 0.12.4
  • 0.13.0
  • 0.13.1
  • 0.13.2
  • 0.13.3
  • 0.13.4
  • 0.13.5
  • 0.2.0
  • 0.3.0
  • 0.3.1
  • 0.4.0
  • 0.5.0
  • 0.6.0
  • 0.7.0
  • 0.8.0
  • 0.9.0
29 results

Target

Select target project
  • tguinot/mat2
  • jvoisin/mat2
  • dachary/mat2
  • mejo-/mat2
  • LogicalDash/mat2
  • dkg/mat2
  • christian/mat2
  • Selflike323/mat2
  • fz/mat2
  • iwwmidatlanticgdc/mat2
  • Gu1nn3zz/mat2
  • smagnin/mat2
  • flashcode/mat2
  • MANCASTILLEJA/mat2
  • jboursier/mat2
  • tails/mat2
  • matiargs/mat2
  • Brolf/mat2
  • madaidan/mat2
  • Delmer84/mat2
  • yuebyzua/mat2
  • yyyyyyyan/mat2
  • rmnvgr/mat2
  • Marxism-Leninism/mat2
  • GNUtoo/mat2
  • allexj/mat2
  • b068931cc450442b63f5b3d276ea4297/mat2
  • chenrui/mat2
  • nosec13346/mat2
  • anelki/mat2
30 results
Select Git revision
  • bak
  • elementary-contract
  • implement_lightweight_mode_msoffice
  • inverted_backup
  • master
  • patch-1
  • 0.1.0
  • 0.1.1
  • 0.1.2
  • 0.1.3
  • 0.10.0
  • 0.2.0
  • 0.3.0
  • 0.3.1
  • 0.4.0
  • 0.5.0
  • 0.6.0
  • 0.7.0
  • 0.8.0
  • 0.9.0
20 results
Show changes

Commits on Source 169

69 additional commits have been omitted to prevent performance issues.
variables: variables:
CONTAINER_REGISTRY: $CI_REGISTRY/georg/mat2-ci-images CONTAINER_REGISTRY: $CI_REGISTRY/georg/mat2-ci-images
GIT_DEPTH: "5"
GIT_STRATEGY: clone
stages: stages:
- linting - linting
...@@ -10,42 +12,22 @@ stages: ...@@ -10,42 +12,22 @@ stages:
- useradd --home-dir ${CI_PROJECT_DIR} mat2 - useradd --home-dir ${CI_PROJECT_DIR} mat2
- chown -R mat2 . - chown -R mat2 .
linting:bandit: linting:ruff:
image: $CONTAINER_REGISTRY:linting
stage: linting
script: # TODO: remove B405 and B314
- bandit ./mat2 --format txt --skip B101
- bandit -r ./nautilus/ --format txt --skip B101
- bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314,B108
linting:codespell:
image: $CONTAINER_REGISTRY:linting
stage: linting
script:
# Run codespell to check for spelling errors; ignore errors about binary
# files, use a config with ignored words and exclude the git directory,
# which might contain false positives
- codespell -q 2 -I utils/ci/codespell/ignored_words.txt -S .git
linting:pylint:
image: $CONTAINER_REGISTRY:linting
stage: linting
script:
- pylint --disable=no-else-return,no-else-raise,no-else-continue,unnecessary-comprehension --extension-pkg-whitelist=cairo,gi ./libmat2 ./mat2
# Once nautilus-python is in Debian, decomment it form the line below
- pylint --disable=no-else-return,no-else-raise,no-else-continue,unnecessary-comprehension --extension-pkg-whitelist=Nautilus,GObject,Gtk,Gio,GLib,gi ./nautilus/mat2.py
linting:pyflakes:
image: $CONTAINER_REGISTRY:linting image: $CONTAINER_REGISTRY:linting
stage: linting stage: linting
script: script:
- pyflakes3 ./libmat2 ./mat2 ./tests/ ./nautilus - apt update
- apt install -qqy --no-install-recommends python3-venv
- python3 -m venv venv
- source venv/bin/activate
- pip3 install ruff
- ruff check .
linting:mypy: linting:mypy:
image: $CONTAINER_REGISTRY:linting image: $CONTAINER_REGISTRY:linting
stage: linting stage: linting
script: script:
- mypy --ignore-missing-imports mat2 libmat2/*.py ./nautilus/mat2.py - mypy --ignore-missing-imports mat2 libmat2/*.py
tests:archlinux: tests:archlinux:
image: $CONTAINER_REGISTRY:archlinux image: $CONTAINER_REGISTRY:archlinux
...@@ -56,17 +38,20 @@ tests:archlinux: ...@@ -56,17 +38,20 @@ tests:archlinux:
tests:debian: tests:debian:
image: $CONTAINER_REGISTRY:debian image: $CONTAINER_REGISTRY:debian
stage: test stage: test
<<: *prepare_env
script: script:
- apt-get -qqy purge bubblewrap - apt-get -qqy purge bubblewrap
- python3 -m unittest discover -v - su - mat2 -c "python3-coverage run --branch -m unittest discover -s tests/"
- su - mat2 -c "python3-coverage report --fail-under=95 -m --include 'libmat2/*'"
tests:debian_with_bubblewrap: tests:debian_with_bubblewrap:
image: $CONTAINER_REGISTRY:debian image: $CONTAINER_REGISTRY:debian
stage: test stage: test
allow_failure: true
<<: *prepare_env <<: *prepare_env
script: script:
- su - mat2 -c "python3-coverage run --branch -m unittest discover -s tests/" - apt-get -qqy install bubblewrap
- su - mat2 -c "python3-coverage report --fail-under=100 -m --include 'libmat2/*'" - python3 -m unittest discover -v
tests:fedora: tests:fedora:
image: $CONTAINER_REGISTRY:fedora image: $CONTAINER_REGISTRY:fedora
...@@ -80,3 +65,51 @@ tests:gentoo: ...@@ -80,3 +65,51 @@ tests:gentoo:
<<: *prepare_env <<: *prepare_env
script: script:
- su - mat2 -c "python3 -m unittest discover -v" - su - mat2 -c "python3 -m unittest discover -v"
# CI matrix: run the full unittest suite under every supported CPython
# version, one job per version-specific container image. The jobs are
# identical except for the image tag; they are kept expanded (rather than
# factored through a YAML anchor or `extends`) so that individual versions
# can be added or dropped without touching the others.
# NOTE(review): indentation appears flattened in this rendered diff view;
# in the real .gitlab-ci.yml, `image`/`stage`/`script` must be indented
# under each job key — confirm against the repository copy.
tests:python3.7:
image: $CONTAINER_REGISTRY:python3.7
stage: test
script:
- python3 -m unittest discover -v
tests:python3.8:
image: $CONTAINER_REGISTRY:python3.8
stage: test
script:
- python3 -m unittest discover -v
tests:python3.9:
image: $CONTAINER_REGISTRY:python3.9
stage: test
script:
- python3 -m unittest discover -v
tests:python3.10:
image: $CONTAINER_REGISTRY:python3.10
stage: test
script:
- python3 -m unittest discover -v
tests:python3.11:
image: $CONTAINER_REGISTRY:python3.11
stage: test
script:
- python3 -m unittest discover -v
tests:python3.12:
image: $CONTAINER_REGISTRY:python3.12
stage: test
script:
- python3 -m unittest discover -v
tests:python3.13:
image: $CONTAINER_REGISTRY:python3.13
stage: test
script:
- python3 -m unittest discover -v
tests:python3.14:
image: $CONTAINER_REGISTRY:python3.14
stage: test
script:
- python3 -m unittest discover -v
...@@ -14,4 +14,5 @@ disable= ...@@ -14,4 +14,5 @@ disable=
catching-non-exception, catching-non-exception,
cell-var-from-loop, cell-var-from-loop,
locally-disabled, locally-disabled,
raise-missing-from,
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
# 0.13.5 - 2025-01-09
- Keep orientation metadata on jpeg and tiff files
- Improve cairo-related error/exceptions handling
- Improve the logging
- Improve the sandboxing
- Improve Python3.12 support
- Improve MSOffice documents handling
# 0.13.4 - 2023-08-02
- Add documentation about mat2 on OSX
- Make use of python3.7 constructs to simplify code
- Use more modern type annotations
- Harden get_meta in archive.py against variants of CVE-2022-35410
- Improve MSOffice document support
- Package the manpage on pypi
# 0.13.3 - 2023-02-23
- Fix a decorator argument
# 0.13.2 - 2023-01-28
- Fix a crash on some python versions
# 0.13.1 - 2023-01-07
- Improve xlsx support
- Remove the Nautilus extension
# 0.13.0 - 2022-07-06
- Fix an arbitrary file read (CVE-2022-35410)
- Add support for heic files
# 0.12.4 - 2022-04-30
- Fix possible errors/crashes when processing multiple files
via the command line interface
- Use a fixed PDF version for the output
- Improve compatibility with modern versions of rsvg
- Improve the robustness of the command line interface with
regard to control characters
# 0.12.3 - 2022-01-06
- Implement code for internationalization
- Keep individual files compression type in zip files
- Increase the robustness of mat2 against weird/corrupted files
- Fix the dolphin integration
- Add a fuzzer
# 0.12.2 - 2021-08-29
- Add support for aiff files
- Improve MS Office support
- Improve compatibility with newer/older version of mat2's dependencies
- Fix possible issues with the resolution of processed pdf
# 0.12.1 - 2021-03-19
- Improve epub support
- Improve MS Office support
# 0.12.0 - 2020-12-18
- Improve significantly MS Office formats support
- Fix some typos in the Nautilus extension
- Improve reliability of the mp3, pdf and svg parsers
- Improve compatibility with ffmpeg when sandboxing is used
- Improve the dolphin extension usability
- libmat2 now raises a ValueError on malformed files while trying to
find the right parser, instead of returning None
# 0.11.0 - 2020-03-29
- Improve significantly MS Office formats support
- Refactor how mat2 looks for executables
# 0.10.1 - 2020-02-09
- Improve the documentation and the manpage
- Improve the robustness of css, html, png, gdk-based, exiftool-based parsers
- Future-proof a bit the testsuite
- Handle tiff files with a .tif extension
- Improve the sandbox' usability
- Add support for wav files
# 0.10.0 - 2019-11-30 # 0.10.0 - 2019-11-30
- Make mat2 work on Python3.8 - Make mat2 work on Python3.8
......
...@@ -4,8 +4,14 @@ The main repository for mat2 is on [0xacab]( https://0xacab.org/jvoisin/mat2 ), ...@@ -4,8 +4,14 @@ The main repository for mat2 is on [0xacab]( https://0xacab.org/jvoisin/mat2 ),
but you can send patches to jvoisin by [email](https://dustri.org/) if you prefer. but you can send patches to jvoisin by [email](https://dustri.org/) if you prefer.
Do feel free to pick up [an issue]( https://0xacab.org/jvoisin/mat2/issues ) Do feel free to pick up [an issue]( https://0xacab.org/jvoisin/mat2/issues )
and to send a pull-request. Please do check that everything is fine by running the and to send a pull-request.
testsuite with `python3 -m unittest discover -v` before submitting one :)
Before sending the pull-request, please do check that everything is fine by
running the full test suite in GitLab. To do that, after forking mat2 in GitLab,
you need to go in Settings -> CI/CD -> Runner and there enable shared runners.
Mat2 also has unit tests (that are also run in the full test suite). You can run
them with `python3 -m unittest discover -v`.
If you're fixing a bug or adding a new feature, please add tests accordingly, If you're fixing a bug or adding a new feature, please add tests accordingly,
this will greatly improve the odds of your merge-request getting merged. this will greatly improve the odds of your merge-request getting merged.
...@@ -24,15 +30,16 @@ Since mat2 is written in Python3, please conform as much as possible to the ...@@ -24,15 +30,16 @@ Since mat2 is written in Python3, please conform as much as possible to the
1. Update the [changelog](https://0xacab.org/jvoisin/mat2/blob/master/CHANGELOG.md) 1. Update the [changelog](https://0xacab.org/jvoisin/mat2/blob/master/CHANGELOG.md)
2. Update the version in the [mat2](https://0xacab.org/jvoisin/mat2/blob/master/mat2) file 2. Update the version in the [mat2](https://0xacab.org/jvoisin/mat2/blob/master/mat2) file
3. Update the version in the [setup.py](https://0xacab.org/jvoisin/mat2/blob/master/setup.py) file 3. Update the version in the [setup.py](https://0xacab.org/jvoisin/mat2/blob/master/setup.py) file
4. Update the version and date in the [man page](https://0xacab.org/jvoisin/mat2/blob/master/doc/mat2.1) 4. Update the version in the [pyproject.toml](https://0xacab.org/jvoisin/mat2/blob/master/pyproject.toml) file
5. Commit the changelog, man page, mat2 and setup.py files 5. Update the version and date in the [man page](https://0xacab.org/jvoisin/mat2/blob/master/doc/mat2.1)
6. Create a tag with `git tag -s $VERSION` 6. Commit the modified files
7. Push the commit with `git push origin master` 7. Create a tag with `git tag -s $VERSION`
8. Push the tag with `git push --tags` 8. Push the commit with `git push origin master`
9. Download the gitlab archive of the release 9. Push the tag with `git push --tags`
10. Diff it against the local copy 10. Download the gitlab archive of the release
11. If there is no difference, sign the archive with `gpg --armor --detach-sign mat2-$VERSION.tar.xz` 11. Diff it against the local copy
12. Upload the signature on Gitlab's [tag page](https://0xacab.org/jvoisin/mat2/tags) and add the changelog there 12. If there is no difference, sign the archive with `gpg --armor --detach-sign mat2-$VERSION.tar.xz`
13. Announce the release on the [mailing list](https://mailman.boum.org/listinfo/mat-dev) 13. Upload the signature on Gitlab's [tag page](https://0xacab.org/jvoisin/mat2/tags) and add the changelog there
14. Sign'n'upload the new version on pypi with `python3 setup.py sdist bdist_wheel` then `twine upload -s dist/*` 14. Announce the release on the [mailing list](https://mailman.boum.org/listinfo/mat-dev)
15. Do the secret release dance 15. Sign'n'upload the new version on pypi with `python3 setup.py sdist bdist_wheel` then `twine upload -s dist/*`
16. Do the secret release dance
...@@ -18,34 +18,53 @@ installed, mat2 uses it to sandbox any external processes it invokes. ...@@ -18,34 +18,53 @@ installed, mat2 uses it to sandbox any external processes it invokes.
## Arch Linux ## Arch Linux
Thanks to [Francois_B](https://www.sciunto.org/), there is a package available on Thanks to [kpcyrd](https://archlinux.org/packages/?maintainer=kpcyrd), there is a package available on
[Arch linux's AUR](https://aur.archlinux.org/packages/mat2/). [Arch linux's AUR](https://archlinux.org/packages/extra/any/mat2/).
## Debian ## Debian
There is a package available in [Debian](https://packages.debian.org/search?keywords=mat2&searchon=names&section=all). There is a package available in [Debian](https://packages.debian.org/search?keywords=mat2&searchon=names&section=all) and you can install mat2 with:
```
apt install mat2
```
## Fedora ## Fedora
Thanks to [atenart](https://ack.tf/), there is a package available on Thanks to [atenart](https://ack.tf/), there is a package available on
[Fedora's copr]( https://copr.fedorainfracloud.org/coprs/atenart/mat2/ ). [Fedora's copr]( https://copr.fedorainfracloud.org/coprs/atenart/mat2/ ).
We use copr (cool other packages repo) as the Mat2 Nautilus plugin depends on First you need to enable mat2's copr:
python3-nautilus, which isn't available yet in Fedora (but is distributed
through this copr).
First you need to enable Mat2's copr:
``` ```
dnf -y copr enable atenart/mat2 dnf -y copr enable atenart/mat2
``` ```
Then you can install both the Mat2 command and Nautilus extension: Then you can install mat2:
``` ```
dnf -y install mat2 mat2-nautilus dnf -y install mat2
``` ```
## Gentoo ## Gentoo
mat2 is available in the [torbrowser overlay](https://github.com/MeisterP/torbrowser-overlay). mat2 is available in the [torbrowser overlay](https://github.com/MeisterP/torbrowser-overlay).
# OSX
## Homebrew
mat2 is [available on homebrew](https://formulae.brew.sh/formula/mat2):
```
brew install mat2
```
## MacPorts
mat2 is [available on MacPorts](https://ports.macports.org/port/mat2/):
```
port install mat2
```
``` ```
_____ _____ _____ ___ _____ _____ _____ ___
| | _ |_ _|_ | Keep your data, | | _ |_ _|_ | Keep your data,
| | | | | | | | _| trash your meta! | | | | |_| | | | | _| trash your meta!
|_|_|_|__|__| |_| |___| |_|_|_|_| |_| |_| |___|
``` ```
This software is currently in **beta**, please don't use it for anything
critical.
# Metadata and privacy # Metadata and privacy
Metadata consist of information that characterizes data. Metadata consist of information that characterizes data.
...@@ -25,9 +22,14 @@ Maybe you don't want to disclose those information. ...@@ -25,9 +22,14 @@ Maybe you don't want to disclose those information.
This is precisely the job of mat2: getting rid, as much as possible, of This is precisely the job of mat2: getting rid, as much as possible, of
metadata. metadata.
mat2 provides a command line tool, and graphical user interfaces via a service mat2 provides:
menu for Dolphin, the default file manager of KDE, and an extension for - a library called `libmat2`;
Nautilus, the default file manager of GNOME. - a command line tool called `mat2`,
- a service menu for Dolphin, KDE's default file manager
If you prefer a regular graphical user interface, you might be interested in
[Metadata Cleaner](https://metadatacleaner.romainvigier.fr/), which is using
`mat2` under the hood.
# Requirements # Requirements
...@@ -41,6 +43,12 @@ Nautilus, the default file manager of GNOME. ...@@ -41,6 +43,12 @@ Nautilus, the default file manager of GNOME.
Please note that mat2 requires at least Python3.5. Please note that mat2 requires at least Python3.5.
# Requirements setup on macOS (OS X) using [Homebrew](https://brew.sh/)
```bash
brew install exiftool cairo pygobject3 poppler gdk-pixbuf librsvg ffmpeg
```
# Running the test suite # Running the test suite
```bash ```bash
...@@ -74,7 +82,7 @@ optional arguments: ...@@ -74,7 +82,7 @@ optional arguments:
(policy should be one of: abort, omit, keep) [Default: (policy should be one of: abort, omit, keep) [Default:
abort] abort]
--inplace clean in place, without backup --inplace clean in place, without backup
--no-sandbox Disable bubblewrap's sandboxing. --no-sandbox Disable bubblewrap's sandboxing
-v, --version show program's version number and exit -v, --version show program's version number and exit
-l, --list list all supported fileformats -l, --list list all supported fileformats
--check-dependencies check if mat2 has all the dependencies it needs --check-dependencies check if mat2 has all the dependencies it needs
...@@ -87,6 +95,26 @@ Note that mat2 **will not** clean files in-place, but will produce, for ...@@ -87,6 +95,26 @@ Note that mat2 **will not** clean files in-place, but will produce, for
example, with a file named "myfile.png" a cleaned version named example, with a file named "myfile.png" a cleaned version named
"myfile.cleaned.png". "myfile.cleaned.png".
## Web interface
It's possible to run mat2 as a web service, via
[mat2-web](https://0xacab.org/jvoisin/mat2-web).
If you're using WordPress, you might be interested in [wp-mat](https://git.autistici.org/noblogs/wp-mat)
and [wp-mat-server](https://git.autistici.org/noblogs/wp-mat-server).
## Desktop GUI
For GNU/Linux desktops, it's possible to use the
[Metadata Cleaner](https://gitlab.com/rmnvgr/metadata-cleaner) GTK application.
# Supported formats
The following formats are supported: avi, bmp, css, epub/ncx, flac, gif, jpeg,
m4a/mp2/mp3/…, mp4, odc/odf/odg/odi/odp/ods/odt/…, off/opus/oga/spx/…, pdf,
png, ppm, pptx/xlsx/docx/…, svg/svgz/…, tar/tar.gz/tar.bz2/tar.xz/…, tiff,
torrent, wav, wmv, zip, …
# Notes about detecting metadata # Notes about detecting metadata
While mat2 is doing its very best to display metadata when the `--show` flag is While mat2 is doing its very best to display metadata when the `--show` flag is
...@@ -116,15 +144,21 @@ of the guarantee that mat2 won't modify the data of their files, there is the ...@@ -116,15 +144,21 @@ of the guarantee that mat2 won't modify the data of their files, there is the
watermarks from PDF. watermarks from PDF.
- [Scrambled Exif](https://f-droid.org/packages/com.jarsilio.android.scrambledeggsif/), - [Scrambled Exif](https://f-droid.org/packages/com.jarsilio.android.scrambledeggsif/),
an open-source Android application to remove metadata from pictures. an open-source Android application to remove metadata from pictures.
- [Dangerzone](https://dangerzone.rocks/), designed to sanitize harmful documents
into harmless ones.
# Contact # Contact
If possible, use the [issues system](https://0xacab.org/jvoisin/mat2/issues) If possible, use the [issues system](https://0xacab.org/jvoisin/mat2/issues)
or the [mailing list](https://mailman.boum.org/listinfo/mat-dev) or the [mailing list](https://www.autistici.org/mailman/listinfo/mat-dev)
Should a more private contact be needed (eg. for reporting security issues), Should a more private contact be needed (eg. for reporting security issues),
you can email Julien (jvoisin) Voisin at `julien.voisin+mat2@dustri.org`, you can email Julien (jvoisin) Voisin at `julien.voisin+mat2@dustri.org`,
using the gpg key `9FCDEE9E1A381F311EA62A7404D041E8171901CC`. using the gpg key `9FCDEE9E1A381F311EA62A7404D041E8171901CC`.
# Donations
If you want to donate some money, please give it to [Tails]( https://tails.boum.org/donate/?r=contribute ).
# License # License
This program is free software: you can redistribute it and/or modify This program is free software: you can redistribute it and/or modify
...@@ -146,6 +180,8 @@ Copyright 2016 Marie-Rose for mat2's logo ...@@ -146,6 +180,8 @@ Copyright 2016 Marie-Rose for mat2's logo
The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3, The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3,
and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx
The `narrated_powerpoint_presentation.pptx` file is in the public domain.
# Thanks # Thanks
mat2 wouldn't exist without: mat2 wouldn't exist without:
...@@ -155,4 +191,3 @@ mat2 wouldn't exist without: ...@@ -155,4 +191,3 @@ mat2 wouldn't exist without:
- friends - friends
Many thanks to them! Many thanks to them!
.TH mat2 "1" "November 2019" "mat2 0.10.0" "User Commands" .TH mat2 "1" "January 2025" "mat2 0.13.5" "User Commands"
.SH NAME .SH NAME
mat2 \- the metadata anonymisation toolkit 2 mat2 \- the metadata anonymisation toolkit 2
...@@ -71,6 +71,14 @@ complex file formats. ...@@ -71,6 +71,14 @@ complex file formats.
.PP .PP
This is why you shouldn't rely on metadata's presence to decide if your file must This is why you shouldn't rely on metadata's presence to decide if your file must
be cleaned or not. be cleaned or not.
.PP
Moreover, mat2 goes to great lengths to make sure that as much metadata as
possible is removed. This might sometimes result in a loss of quality of the
processed files. For example, a text-based pdf file converted into an image-based
one means that it will no longer be possible to select text in it. If you're
experiencing this, you might want to give the lightweight cleaning mode a try,
but keep in mind that by doing so, some metadata \fBwon't be cleaned\fR.
.SH BUGS .SH BUGS
......
...@@ -6,6 +6,8 @@ Type=Service ...@@ -6,6 +6,8 @@ Type=Service
[Desktop Action cleanMetadata] [Desktop Action cleanMetadata]
Name=Clean metadata Name=Clean metadata
Name[de]=Metadaten löschen
Name[es]=Limpiar metadatos Name[es]=Limpiar metadatos
Icon=/usr/share/icons/hicolor/scalable/apps/mat2.svg Icon=/usr/share/icons/hicolor/scalable/apps/mat2.svg
Exec=kdialog --yesno "$( mat2 -s %U )" --title "Clean Metadata?" && mat2 %U Exec=kdialog --yesno "$( mat2 -s %F )" --title "Clean Metadata?" && mat2 %U
Exec[de]=kdialog --yesno "$( mat2 -s %F )" --title "Metadaten löschen?" && mat2 %U
...@@ -2,15 +2,10 @@ ...@@ -2,15 +2,10 @@
import enum import enum
import importlib import importlib
from typing import Dict, Optional, Union from typing import Dict
from . import exiftool, video from . import exiftool, video
# make pyflakes happy
assert Dict
assert Optional
assert Union
# A set of extension that aren't supported, despite matching a supported mimetype # A set of extension that aren't supported, despite matching a supported mimetype
UNSUPPORTED_EXTENSIONS = { UNSUPPORTED_EXTENSIONS = {
'.asc', '.asc',
...@@ -67,8 +62,9 @@ CMD_DEPENDENCIES = { ...@@ -67,8 +62,9 @@ CMD_DEPENDENCIES = {
}, },
} }
def check_dependencies() -> Dict[str, Dict[str, bool]]: def check_dependencies() -> Dict[str, Dict[str, bool]]:
ret = dict() # type: Dict[str, dict] ret: Dict[str, Dict] = dict()
for key, value in DEPENDENCIES.items(): for key, value in DEPENDENCIES.items():
ret[key] = { ret[key] = {
......
import abc import abc
import os import os
import re import re
from typing import Set, Dict, Union from typing import Union, Set, Dict
assert Set # make pyflakes happy
class AbstractParser(abc.ABC): class AbstractParser(abc.ABC):
...@@ -11,8 +9,8 @@ class AbstractParser(abc.ABC): ...@@ -11,8 +9,8 @@ class AbstractParser(abc.ABC):
It might yield `ValueError` on instantiation on invalid files, It might yield `ValueError` on instantiation on invalid files,
and `RuntimeError` when something went wrong in `remove_all`. and `RuntimeError` when something went wrong in `remove_all`.
""" """
meta_list = set() # type: Set[str] meta_list: Set[str] = set()
mimetypes = set() # type: Set[str] mimetypes: Set[str] = set()
def __init__(self, filename: str) -> None: def __init__(self, filename: str) -> None:
""" """
...@@ -35,8 +33,11 @@ class AbstractParser(abc.ABC): ...@@ -35,8 +33,11 @@ class AbstractParser(abc.ABC):
self.sandbox = True self.sandbox = True
@abc.abstractmethod @abc.abstractmethod
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, Dict]]:
"""Return all the metadata of the current file""" """Return all the metadata of the current file
:raises RuntimeError: Raised if the cleaning process went wrong.
"""
@abc.abstractmethod @abc.abstractmethod
def remove_all(self) -> bool: def remove_all(self) -> bool:
......
...@@ -7,14 +7,10 @@ import tempfile ...@@ -7,14 +7,10 @@ import tempfile
import os import os
import logging import logging
import shutil import shutil
from typing import Dict, Set, Pattern, Union, Any, List from typing import Pattern, Union, Any, Set, Dict, List
from . import abstract, UnknownMemberPolicy, parser_factory from . import abstract, UnknownMemberPolicy, parser_factory
# Make pyflakes happy
assert Set
assert Pattern
# pylint: disable=not-callable,assignment-from-no-return,too-many-branches # pylint: disable=not-callable,assignment-from-no-return,too-many-branches
# An ArchiveClass is a class representing an archive, # An ArchiveClass is a class representing an archive,
...@@ -53,15 +49,15 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -53,15 +49,15 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# Those are the files that have a format that _isn't_ # Those are the files that have a format that _isn't_
# supported by mat2, but that we want to keep anyway. # supported by mat2, but that we want to keep anyway.
self.files_to_keep = set() # type: Set[Pattern] self.files_to_keep: Set[Pattern] = set()
# Those are the files that we _do not_ want to keep, # Those are the files that we _do not_ want to keep,
# no matter if they are supported or not. # no matter if they are supported or not.
self.files_to_omit = set() # type: Set[Pattern] self.files_to_omit: Set[Pattern] = set()
# what should the parser do if it encounters an unknown file in # what should the parser do if it encounters an unknown file in
# the archive? # the archive?
self.unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy self.unknown_member_policy: UnknownMemberPolicy = UnknownMemberPolicy.ABORT
# The LGTM comment is to mask a false-positive, # The LGTM comment is to mask a false-positive,
# see https://lgtm.com/projects/g/jvoisin/mat2/ # see https://lgtm.com/projects/g/jvoisin/mat2/
...@@ -73,15 +69,22 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -73,15 +69,22 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
def _specific_cleanup(self, full_path: str) -> bool: def _specific_cleanup(self, full_path: str) -> bool:
""" This method can be used to apply specific treatment """ This method can be used to apply specific treatment
to files present in the archive.""" to files present in the archive."""
# pylint: disable=unused-argument,no-self-use # pylint: disable=unused-argument
return True # pragma: no cover return True # pragma: no cover
def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]: def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
""" This method can be used to extract specific metadata """ This method can be used to extract specific metadata
from files present in the archive.""" from files present in the archive."""
# pylint: disable=unused-argument,no-self-use # pylint: disable=unused-argument
return {} # pragma: no cover return {} # pragma: no cover
def _final_checks(self) -> bool:
""" This method is invoked after the file has been cleaned,
allowing to run final verifications.
"""
# pylint: disable=unused-argument
return True
@staticmethod @staticmethod
@abc.abstractmethod @abc.abstractmethod
def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]: def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
...@@ -102,6 +105,11 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -102,6 +105,11 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
def _get_member_name(member: ArchiveMember) -> str: def _get_member_name(member: ArchiveMember) -> str:
"""Return the name of the given member.""" """Return the name of the given member."""
@staticmethod
@abc.abstractmethod
def _is_dir(member: ArchiveMember) -> bool:
"""Return true is the given member is a directory."""
@abc.abstractmethod @abc.abstractmethod
def _add_file_to_archive(self, archive: ArchiveClass, member: ArchiveMember, def _add_file_to_archive(self, archive: ArchiveClass, member: ArchiveMember,
full_path: str): full_path: str):
...@@ -113,8 +121,20 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -113,8 +121,20 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# pylint: disable=unused-argument # pylint: disable=unused-argument
return member return member
def get_meta(self) -> Dict[str, Union[str, dict]]: @staticmethod
meta = dict() # type: Dict[str, Union[str, dict]] def _get_member_compression(member: ArchiveMember):
"""Get the compression of the archive member."""
# pylint: disable=unused-argument
return None
@staticmethod
def _set_member_compression(member: ArchiveMember, compression) -> ArchiveMember:
"""Set the compression of the archive member."""
# pylint: disable=unused-argument
return member
def get_meta(self) -> Dict[str, Union[str, Dict]]:
meta: Dict[str, Union[str, Dict]] = dict()
with self.archive_class(self.filename) as zin: with self.archive_class(self.filename) as zin:
temp_folder = tempfile.mkdtemp() temp_folder = tempfile.mkdtemp()
...@@ -123,12 +143,20 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -123,12 +143,20 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
local_meta = self._get_member_meta(item) local_meta = self._get_member_meta(item)
member_name = self._get_member_name(item) member_name = self._get_member_name(item)
if member_name[-1] == '/': # pragma: no cover if self._is_dir(item): # pragma: no cover
# `is_dir` is added in Python3.6
continue # don't keep empty folders continue # don't keep empty folders
zin.extract(member=item, path=temp_folder)
full_path = os.path.join(temp_folder, member_name) full_path = os.path.join(temp_folder, member_name)
if not os.path.abspath(full_path).startswith(temp_folder):
logging.error("%s contains a file (%s) pointing outside (%s) of its root.",
self.filename, member_name, full_path)
break
try:
zin.extract(member=item, path=temp_folder)
except OSError as e:
logging.error("Unable to extraxt %s from %s: %s", item, self.filename, e)
os.chmod(full_path, stat.S_IRUSR) os.chmod(full_path, stat.S_IRUSR)
specific_meta = self._specific_get_meta(full_path, member_name) specific_meta = self._specific_get_meta(full_path, member_name)
...@@ -136,6 +164,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -136,6 +164,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
member_parser, _ = parser_factory.get_parser(full_path) # type: ignore member_parser, _ = parser_factory.get_parser(full_path) # type: ignore
if member_parser: if member_parser:
member_parser.sandbox = self.sandbox
local_meta = {**local_meta, **member_parser.get_meta()} local_meta = {**local_meta, **member_parser.get_meta()}
if local_meta: if local_meta:
...@@ -155,12 +184,12 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -155,12 +184,12 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# Sort the items to process, to reduce fingerprinting, # Sort the items to process, to reduce fingerprinting,
# and keep them in the `items` variable. # and keep them in the `items` variable.
items = list() # type: List[ArchiveMember] items: List[ArchiveMember] = list()
for item in sorted(self._get_all_members(zin), key=self._get_member_name): for item in sorted(self._get_all_members(zin), key=self._get_member_name):
# Some fileformats do require to have the `mimetype` file # Some fileformats do require to have the `mimetype` file
# as the first file in the archive. # as the first file in the archive.
if self._get_member_name(item) == 'mimetype': if self._get_member_name(item) == 'mimetype':
items = [item] + items items.insert(0, item)
else: else:
items.append(item) items.append(item)
...@@ -168,18 +197,36 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -168,18 +197,36 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# we're iterating (and thus inserting) them in lexicographic order. # we're iterating (and thus inserting) them in lexicographic order.
for item in items: for item in items:
member_name = self._get_member_name(item) member_name = self._get_member_name(item)
if member_name[-1] == '/': # `is_dir` is added in Python3.6 if self._is_dir(item):
continue # don't keep empty folders continue # don't keep empty folders
zin.extract(member=item, path=temp_folder)
full_path = os.path.join(temp_folder, member_name) full_path = os.path.join(temp_folder, member_name)
if not os.path.abspath(full_path).startswith(temp_folder):
logging.error("%s contains a file (%s) pointing outside (%s) of its root.",
self.filename, member_name, full_path)
abort = True
break
zin.extract(member=item, path=temp_folder)
try:
original_permissions = os.stat(full_path).st_mode original_permissions = os.stat(full_path).st_mode
except FileNotFoundError:
logging.error("Something went wrong during processing of "
"%s in %s, likely a path traversal attack.",
member_name, self.filename)
abort = True
# we're breaking instead of continuing, because this exception
# is raised in case of weird path-traversal-like atttacks.
break
os.chmod(full_path, original_permissions | stat.S_IWUSR | stat.S_IRUSR) os.chmod(full_path, original_permissions | stat.S_IWUSR | stat.S_IRUSR)
original_compression = self._get_member_compression(item)
if self._specific_cleanup(full_path) is False: if self._specific_cleanup(full_path) is False:
logging.warning("Something went wrong during deep cleaning of %s", logging.warning("Something went wrong during deep cleaning of %s in %s",
member_name) member_name, self.filename)
abort = True abort = True
continue continue
...@@ -205,6 +252,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -205,6 +252,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
abort = True abort = True
continue continue
else: else:
member_parser.sandbox = self.sandbox
if member_parser.remove_all() is False: if member_parser.remove_all() is False:
logging.warning("In file %s, something went wrong \ logging.warning("In file %s, something went wrong \
with the cleaning of %s \ with the cleaning of %s \
...@@ -216,6 +264,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -216,6 +264,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
zinfo = self.member_class(member_name) # type: ignore zinfo = self.member_class(member_name) # type: ignore
zinfo = self._set_member_permissions(zinfo, original_permissions) zinfo = self._set_member_permissions(zinfo, original_permissions)
zinfo = self._set_member_compression(zinfo, original_compression)
clean_zinfo = self._clean_member(zinfo) clean_zinfo = self._clean_member(zinfo)
self._add_file_to_archive(zout, clean_zinfo, full_path) self._add_file_to_archive(zout, clean_zinfo, full_path)
...@@ -223,11 +272,14 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -223,11 +272,14 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
if abort: if abort:
os.remove(self.output_filename) os.remove(self.output_filename)
return False return False
if not self._final_checks():
return False # pragma: no cover
return True return True
class TarParser(ArchiveBasedAbstractParser): class TarParser(ArchiveBasedAbstractParser):
mimetypes = {'application/x-tar'} mimetypes = {'application/x-tar'}
def __init__(self, filename): def __init__(self, filename):
super().__init__(filename) super().__init__(filename)
# yes, it's tarfile.open and not tarfile.TarFile, # yes, it's tarfile.open and not tarfile.TarFile,
...@@ -337,6 +389,11 @@ class TarParser(ArchiveBasedAbstractParser): ...@@ -337,6 +389,11 @@ class TarParser(ArchiveBasedAbstractParser):
member.mode = permissions member.mode = permissions
return member return member
@staticmethod
def _is_dir(member: ArchiveMember) -> bool:
assert isinstance(member, tarfile.TarInfo) # please mypy
return member.isdir()
class TarGzParser(TarParser): class TarGzParser(TarParser):
compression = ':gz' compression = ':gz'
...@@ -355,16 +412,17 @@ class TarXzParser(TarParser): ...@@ -355,16 +412,17 @@ class TarXzParser(TarParser):
class ZipParser(ArchiveBasedAbstractParser): class ZipParser(ArchiveBasedAbstractParser):
mimetypes = {'application/zip'} mimetypes = {'application/zip'}
def __init__(self, filename):
def __init__(self, filename: str):
super().__init__(filename) super().__init__(filename)
self.archive_class = zipfile.ZipFile self.archive_class = zipfile.ZipFile
self.member_class = zipfile.ZipInfo self.member_class = zipfile.ZipInfo
self.zip_compression_type = zipfile.ZIP_DEFLATED
def is_archive_valid(self): def is_archive_valid(self):
try: try:
zipfile.ZipFile(self.filename) with zipfile.ZipFile(self.filename):
except zipfile.BadZipFile: pass
except (zipfile.BadZipFile, OSError):
raise ValueError raise ValueError
@staticmethod @staticmethod
...@@ -400,7 +458,7 @@ class ZipParser(ArchiveBasedAbstractParser): ...@@ -400,7 +458,7 @@ class ZipParser(ArchiveBasedAbstractParser):
assert isinstance(member, zipfile.ZipInfo) # please mypy assert isinstance(member, zipfile.ZipInfo) # please mypy
with open(full_path, 'rb') as f: with open(full_path, 'rb') as f:
archive.writestr(member, f.read(), archive.writestr(member, f.read(),
compress_type=self.zip_compression_type) compress_type=member.compress_type)
@staticmethod @staticmethod
def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]: def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
...@@ -411,3 +469,19 @@ class ZipParser(ArchiveBasedAbstractParser): ...@@ -411,3 +469,19 @@ class ZipParser(ArchiveBasedAbstractParser):
def _get_member_name(member: ArchiveMember) -> str: def _get_member_name(member: ArchiveMember) -> str:
assert isinstance(member, zipfile.ZipInfo) # please mypy assert isinstance(member, zipfile.ZipInfo) # please mypy
return member.filename return member.filename
@staticmethod
def _get_member_compression(member: ArchiveMember):
assert isinstance(member, zipfile.ZipInfo) # please mypy
return member.compress_type
@staticmethod
def _set_member_compression(member: ArchiveMember, compression) -> ArchiveMember:
assert isinstance(member, zipfile.ZipInfo) # please mypy
member.compress_type = compression
return member
@staticmethod
def _is_dir(member: ArchiveMember) -> bool:
assert isinstance(member, zipfile.ZipInfo) # please mypy
return member.is_dir()
...@@ -2,42 +2,51 @@ import mimetypes ...@@ -2,42 +2,51 @@ import mimetypes
import os import os
import shutil import shutil
import tempfile import tempfile
from typing import Dict, Union from typing import Union, Dict
import mutagen import mutagen
from . import abstract, parser_factory from . import abstract, parser_factory, video
class MutagenParser(abstract.AbstractParser): class MutagenParser(abstract.AbstractParser):
def __init__(self, filename): def __init__(self, filename):
super().__init__(filename) super().__init__(filename)
try: try:
mutagen.File(self.filename) if mutagen.File(self.filename) is None:
raise ValueError
except mutagen.MutagenError: except mutagen.MutagenError:
raise ValueError raise ValueError
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, Dict]]:
f = mutagen.File(self.filename) f = mutagen.File(self.filename)
if f.tags: if f.tags:
return {k:', '.join(v) for k, v in f.tags.items()} return {k: ', '.join(map(str, v)) for k, v in f.tags.items()}
return {} return {}
def remove_all(self) -> bool: def remove_all(self) -> bool:
shutil.copy(self.filename, self.output_filename) shutil.copy(self.filename, self.output_filename)
f = mutagen.File(self.output_filename) f = mutagen.File(self.output_filename)
try:
f.delete() f.delete()
f.save() f.save()
except mutagen.MutagenError:
raise ValueError
return True return True
class MP3Parser(MutagenParser): class MP3Parser(MutagenParser):
mimetypes = {'audio/mpeg', } mimetypes = {'audio/mpeg', }
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, Dict]]:
metadata = {} # type: Dict[str, Union[str, dict]] metadata: Dict[str, Union[str, Dict]] = dict()
meta = mutagen.File(self.filename).tags meta = mutagen.File(self.filename).tags
if not meta:
return metadata
for key in meta: for key in meta:
if isinstance(key, tuple):
metadata[key[0]] = key[1]
continue
if not hasattr(meta[key], 'text'): # pragma: no cover if not hasattr(meta[key], 'text'): # pragma: no cover
continue continue
metadata[key.rstrip(' \t\r\n\0')] = ', '.join(map(str, meta[key].text)) metadata[key.rstrip(' \t\r\n\0')] = ', '.join(map(str, meta[key].text))
...@@ -59,7 +68,7 @@ class FLACParser(MutagenParser): ...@@ -59,7 +68,7 @@ class FLACParser(MutagenParser):
f.save(deleteid3=True) f.save(deleteid3=True)
return True return True
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, Dict]]:
meta = super().get_meta() meta = super().get_meta()
for num, picture in enumerate(mutagen.File(self.filename).pictures): for num, picture in enumerate(mutagen.File(self.filename).pictures):
name = picture.desc if picture.desc else 'Cover %d' % num name = picture.desc if picture.desc else 'Cover %d' % num
...@@ -73,7 +82,33 @@ class FLACParser(MutagenParser): ...@@ -73,7 +82,33 @@ class FLACParser(MutagenParser):
with open(fname, 'wb') as f: with open(fname, 'wb') as f:
f.write(picture.data) f.write(picture.data)
p, _ = parser_factory.get_parser(fname) # type: ignore p, _ = parser_factory.get_parser(fname) # type: ignore
if p is None:
raise ValueError
p.sandbox = self.sandbox
# Mypy chokes on ternaries :/ # Mypy chokes on ternaries :/
meta[name] = p.get_meta() if p else 'harmful data' # type: ignore meta[name] = p.get_meta() if p else 'harmful data' # type: ignore
os.remove(fname) os.remove(fname)
return meta return meta
class WAVParser(video.AbstractFFmpegParser):
mimetypes = {'audio/x-wav', }
meta_allowlist = {'AvgBytesPerSec', 'BitsPerSample', 'Directory',
'Duration', 'Encoding', 'ExifToolVersion',
'FileAccessDate', 'FileInodeChangeDate',
'FileModifyDate', 'FileName', 'FilePermissions',
'FileSize', 'FileType', 'FileTypeExtension',
'MIMEType', 'NumChannels', 'SampleRate', 'SourceFile',
}
class AIFFParser(video.AbstractFFmpegParser):
mimetypes = {'audio/aiff', 'audio/x-aiff'}
meta_allowlist = {'AvgBytesPerSec', 'BitsPerSample', 'Directory',
'Duration', 'Encoding', 'ExifToolVersion',
'FileAccessDate', 'FileInodeChangeDate',
'FileModifyDate', 'FileName', 'FilePermissions',
'FileSize', 'FileType', 'FileTypeExtension',
'MIMEType', 'NumChannels', 'SampleRate', 'SourceFile',
'NumSampleFrames', 'SampleSize',
}
...@@ -11,7 +11,8 @@ import os ...@@ -11,7 +11,8 @@ import os
import shutil import shutil
import subprocess import subprocess
import tempfile import tempfile
from typing import List, Optional import functools
from typing import Optional, List
__all__ = ['PIPE', 'run', 'CalledProcessError'] __all__ = ['PIPE', 'run', 'CalledProcessError']
...@@ -21,16 +22,15 @@ CalledProcessError = subprocess.CalledProcessError ...@@ -21,16 +22,15 @@ CalledProcessError = subprocess.CalledProcessError
# pylint: disable=subprocess-run-check # pylint: disable=subprocess-run-check
@functools.lru_cache(maxsize=None)
def _get_bwrap_path() -> str: def _get_bwrap_path() -> str:
bwrap_path = '/usr/bin/bwrap' which_path = shutil.which('bwrap')
if os.path.isfile(bwrap_path): if which_path:
if os.access(bwrap_path, os.X_OK): return which_path
return bwrap_path
raise RuntimeError("Unable to find bwrap") # pragma: no cover raise RuntimeError("Unable to find bwrap") # pragma: no cover
# pylint: disable=bad-whitespace
def _get_bwrap_args(tempdir: str, def _get_bwrap_args(tempdir: str,
input_filename: str, input_filename: str,
output_filename: Optional[str] = None) -> List[str]: output_filename: Optional[str] = None) -> List[str]:
...@@ -39,7 +39,7 @@ def _get_bwrap_args(tempdir: str, ...@@ -39,7 +39,7 @@ def _get_bwrap_args(tempdir: str,
# XXX: use --ro-bind-try once all supported platforms # XXX: use --ro-bind-try once all supported platforms
# have a bubblewrap recent enough to support it. # have a bubblewrap recent enough to support it.
ro_bind_dirs = ['/usr', '/lib', '/lib64', '/bin', '/sbin', cwd] ro_bind_dirs = ['/usr', '/lib', '/lib64', '/bin', '/sbin', '/etc/alternatives', cwd]
for bind_dir in ro_bind_dirs: for bind_dir in ro_bind_dirs:
if os.path.isdir(bind_dir): # pragma: no cover if os.path.isdir(bind_dir): # pragma: no cover
ro_bind_args.extend(['--ro-bind', bind_dir, bind_dir]) ro_bind_args.extend(['--ro-bind', bind_dir, bind_dir])
...@@ -78,7 +78,6 @@ def _get_bwrap_args(tempdir: str, ...@@ -78,7 +78,6 @@ def _get_bwrap_args(tempdir: str,
return args return args
# pylint: disable=bad-whitespace
def run(args: List[str], def run(args: List[str],
input_filename: str, input_filename: str,
output_filename: Optional[str] = None, output_filename: Optional[str] = None,
......
import logging import logging
import re import re
import uuid import uuid
import zipfile
import xml.etree.ElementTree as ET # type: ignore import xml.etree.ElementTree as ET # type: ignore
from typing import Any, Dict
from . import archive, office from . import archive, office
class EPUBParser(archive.ZipParser): class EPUBParser(archive.ZipParser):
mimetypes = {'application/epub+zip', } mimetypes = {'application/epub+zip', }
metadata_namespace = '{http://purl.org/dc/elements/1.1/}' metadata_namespace = '{http://purl.org/dc/elements/1.1/}'
...@@ -15,11 +18,27 @@ class EPUBParser(archive.ZipParser): ...@@ -15,11 +18,27 @@ class EPUBParser(archive.ZipParser):
'META-INF/container.xml', 'META-INF/container.xml',
'mimetype', 'mimetype',
'OEBPS/content.opf', 'OEBPS/content.opf',
'content.opf',
'hmh.opf',
'OPS/.+.xml'
}))
self.files_to_omit = set(map(re.compile, { # type: ignore
'iTunesMetadata.plist',
'META-INF/calibre_bookmarks.txt',
'OEBPS/package.opf',
})) }))
self.uniqid = uuid.uuid4() self.uniqid = uuid.uuid4()
def _specific_get_meta(self, full_path, file_path): def is_archive_valid(self):
if file_path != 'OEBPS/content.opf': super().is_archive_valid()
with zipfile.ZipFile(self.filename) as zin:
for item in self._get_all_members(zin):
member_name = self._get_member_name(item)
if member_name.endswith('META-INF/encryption.xml'):
raise ValueError('the file contains encrypted fonts')
def _specific_get_meta(self, full_path, file_path) -> Dict[str, Any]:
if not file_path.endswith('.opf'):
return {} return {}
with open(full_path, encoding='utf-8') as f: with open(full_path, encoding='utf-8') as f:
...@@ -30,14 +49,31 @@ class EPUBParser(archive.ZipParser): ...@@ -30,14 +49,31 @@ class EPUBParser(archive.ZipParser):
except (TypeError, UnicodeDecodeError): except (TypeError, UnicodeDecodeError):
return {file_path: 'harmful content', } return {file_path: 'harmful content', }
def _specific_cleanup(self, full_path: str): def _specific_cleanup(self, full_path: str) -> bool:
if full_path.endswith('OEBPS/content.opf'): if full_path.endswith('hmh.opf') or full_path.endswith('content.opf'):
return self.__handle_contentopf(full_path) return self.__handle_contentopf(full_path)
elif full_path.endswith('OEBPS/toc.ncx'): elif full_path.endswith('OEBPS/toc.ncx'):
return self.__handle_tocncx(full_path) return self.__handle_tocncx(full_path)
elif re.search('/OPS/[^/]+.xml$', full_path):
return self.__handle_ops_xml(full_path)
return True
def __handle_ops_xml(self, full_path: str) -> bool:
try:
tree, namespace = office._parse_xml(full_path)
except ET.ParseError: # pragma: nocover
logging.error("Unable to parse %s in %s.", full_path, self.filename)
return False
for item in tree.iterfind('.//', namespace): # pragma: nocover
if item.tag.strip().lower().endswith('head'):
item.clear()
break
tree.write(full_path, xml_declaration=True, encoding='utf-8',
short_empty_elements=False)
return True return True
def __handle_tocncx(self, full_path: str): def __handle_tocncx(self, full_path: str) -> bool:
try: try:
tree, namespace = office._parse_xml(full_path) tree, namespace = office._parse_xml(full_path)
except ET.ParseError: # pragma: nocover except ET.ParseError: # pragma: nocover
...@@ -53,7 +89,7 @@ class EPUBParser(archive.ZipParser): ...@@ -53,7 +89,7 @@ class EPUBParser(archive.ZipParser):
short_empty_elements=False) short_empty_elements=False)
return True return True
def __handle_contentopf(self, full_path: str): def __handle_contentopf(self, full_path: str) -> bool:
try: try:
tree, namespace = office._parse_xml(full_path) tree, namespace = office._parse_xml(full_path)
except ET.ParseError: except ET.ParseError:
...@@ -71,7 +107,7 @@ class EPUBParser(archive.ZipParser): ...@@ -71,7 +107,7 @@ class EPUBParser(archive.ZipParser):
item.append(uniqid) item.append(uniqid)
# items without mandatory content # items without mandatory content
for name in {'language', 'title'}: for name in ['language', 'title']:
uniqid = ET.Element(self.metadata_namespace + name) uniqid = ET.Element(self.metadata_namespace + name)
item.append(uniqid) item.append(uniqid)
break # there is only a single <metadata> block break # there is only a single <metadata> block
......
...@@ -2,24 +2,22 @@ import functools ...@@ -2,24 +2,22 @@ import functools
import json import json
import logging import logging
import os import os
import shutil
import subprocess import subprocess
from typing import Dict, Union, Set from typing import Union, Set, Dict
from . import abstract from . import abstract
from . import bubblewrap from . import bubblewrap
# Make pyflakes happy
assert Set
class ExiftoolParser(abstract.AbstractParser): class ExiftoolParser(abstract.AbstractParser):
""" Exiftool is often the easiest way to get all the metadata """ Exiftool is often the easiest way to get all the metadata
from a import file, hence why several parsers are re-using its `get_meta` from a import file, hence why several parsers are re-using its `get_meta`
method. method.
""" """
meta_allowlist = set() # type: Set[str] meta_allowlist: Set[str] = set()
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, Dict]]:
try: try:
if self.sandbox: if self.sandbox:
out = bubblewrap.run([_get_exiftool_path(), '-json', out = bubblewrap.run([_get_exiftool_path(), '-json',
...@@ -69,16 +67,14 @@ class ExiftoolParser(abstract.AbstractParser): ...@@ -69,16 +67,14 @@ class ExiftoolParser(abstract.AbstractParser):
return False return False
return True return True
@functools.lru_cache() @functools.lru_cache(maxsize=None)
def _get_exiftool_path() -> str: # pragma: no cover def _get_exiftool_path() -> str: # pragma: no cover
possible_pathes = { which_path = shutil.which('exiftool')
'/usr/bin/exiftool', # debian/fedora if which_path:
'/usr/bin/vendor_perl/exiftool', # archlinux return which_path
}
for possible_path in possible_pathes: # Exiftool on Arch Linux has a weird path
if os.path.isfile(possible_path): if os.access('/usr/bin/vendor_perl/exiftool', os.X_OK):
if os.access(possible_path, os.X_OK): return '/usr/bin/vendor_perl/exiftool'
return possible_path
raise RuntimeError("Unable to find exiftool") raise RuntimeError("Unable to find exiftool")
import shutil import shutil
from typing import Dict, Union from typing import Union, Dict
from . import abstract from . import abstract
class HarmlessParser(abstract.AbstractParser): class HarmlessParser(abstract.AbstractParser):
""" This is the parser for filetypes that can not contain metadata. """ """ This is the parser for filetypes that can not contain metadata. """
mimetypes = {'text/plain', 'image/x-ms-bmp'} mimetypes = {'text/plain', 'image/x-ms-bmp', 'image/bmp'}
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, Dict]]:
return dict() return dict()
def remove_all(self) -> bool: def remove_all(self) -> bool:
......
import imghdr
import os import os
import re import re
from typing import Set, Dict, Union, Any from typing import Union, Any, Dict
import cairo import cairo
...@@ -12,10 +11,6 @@ from gi.repository import GdkPixbuf, GLib, Rsvg ...@@ -12,10 +11,6 @@ from gi.repository import GdkPixbuf, GLib, Rsvg
from . import exiftool, abstract from . import exiftool, abstract
# Make pyflakes happy
assert Set
assert Any
class SVGParser(exiftool.ExiftoolParser): class SVGParser(exiftool.ExiftoolParser):
mimetypes = {'image/svg+xml', } mimetypes = {'image/svg+xml', }
meta_allowlist = {'Directory', 'ExifToolVersion', 'FileAccessDate', meta_allowlist = {'Directory', 'ExifToolVersion', 'FileAccessDate',
...@@ -26,25 +21,40 @@ class SVGParser(exiftool.ExiftoolParser): ...@@ -26,25 +21,40 @@ class SVGParser(exiftool.ExiftoolParser):
} }
def remove_all(self) -> bool: def remove_all(self) -> bool:
try:
svg = Rsvg.Handle.new_from_file(self.filename) svg = Rsvg.Handle.new_from_file(self.filename)
except GLib.GError:
raise ValueError
try:
_, _, _, _, has_viewbox, viewbox = svg.get_intrinsic_dimensions()
if has_viewbox is False:
raise ValueError
_, width, height = svg.get_intrinsic_size_in_pixels()
except AttributeError:
dimensions = svg.get_dimensions() dimensions = svg.get_dimensions()
surface = cairo.SVGSurface(self.output_filename, height, width = dimensions.height, dimensions.width
dimensions.height,
dimensions.width) surface = cairo.SVGSurface(self.output_filename, height, width)
context = cairo.Context(surface) context = cairo.Context(surface)
try:
svg.render_document(context, viewbox)
except AttributeError:
svg.render_cairo(context) svg.render_cairo(context)
surface.finish() surface.finish()
return True return True
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, Dict]]:
meta = super().get_meta() meta = super().get_meta()
# The namespace is mandatory, but only the …/2000/svg is valid. # The namespace is mandatory, but only the …/2000/svg is valid.
ns = 'http://www.w3.org/2000/svg' ns = 'http://www.w3.org/2000/svg'
if meta.get('Xmlns', ns) == ns: if meta.get('Xmlns') == ns:
meta.pop('Xmlns') meta.pop('Xmlns')
return meta return meta
class PNGParser(exiftool.ExiftoolParser): class PNGParser(exiftool.ExiftoolParser):
mimetypes = {'image/png', } mimetypes = {'image/png', }
meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName', meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName',
...@@ -58,12 +68,9 @@ class PNGParser(exiftool.ExiftoolParser): ...@@ -58,12 +68,9 @@ class PNGParser(exiftool.ExiftoolParser):
def __init__(self, filename): def __init__(self, filename):
super().__init__(filename) super().__init__(filename)
if imghdr.what(filename) != 'png':
raise ValueError
try: # better fail here than later try: # better fail here than later
cairo.ImageSurface.create_from_png(self.filename) cairo.ImageSurface.create_from_png(self.filename)
except Exception: # pragma: no cover except: # pragma: no cover
# Cairo is returning some weird exceptions :/ # Cairo is returning some weird exceptions :/
raise ValueError raise ValueError
...@@ -98,7 +105,6 @@ class GdkPixbufAbstractParser(exiftool.ExiftoolParser): ...@@ -98,7 +105,6 @@ class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
def __init__(self, filename): def __init__(self, filename):
super().__init__(filename) super().__init__(filename)
# we can't use imghdr here because of https://bugs.python.org/issue28591
try: try:
GdkPixbuf.Pixbuf.new_from_file(self.filename) GdkPixbuf.Pixbuf.new_from_file(self.filename)
except GLib.GError: except GLib.GError:
...@@ -110,6 +116,7 @@ class GdkPixbufAbstractParser(exiftool.ExiftoolParser): ...@@ -110,6 +116,7 @@ class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
_, extension = os.path.splitext(self.filename) _, extension = os.path.splitext(self.filename)
pixbuf = GdkPixbuf.Pixbuf.new_from_file(self.filename) pixbuf = GdkPixbuf.Pixbuf.new_from_file(self.filename)
pixbuf = GdkPixbuf.Pixbuf.apply_embedded_orientation(pixbuf)
if extension.lower() == '.jpg': if extension.lower() == '.jpg':
extension = '.jpeg' # gdk is picky extension = '.jpeg' # gdk is picky
elif extension.lower() == '.tif': elif extension.lower() == '.tif':
...@@ -132,7 +139,7 @@ class JPGParser(GdkPixbufAbstractParser): ...@@ -132,7 +139,7 @@ class JPGParser(GdkPixbufAbstractParser):
'MIMEType', 'ImageWidth', 'ImageSize', 'BitsPerSample', 'MIMEType', 'ImageWidth', 'ImageSize', 'BitsPerSample',
'ColorComponents', 'EncodingProcess', 'JFIFVersion', 'ColorComponents', 'EncodingProcess', 'JFIFVersion',
'ResolutionUnit', 'XResolution', 'YCbCrSubSampling', 'ResolutionUnit', 'XResolution', 'YCbCrSubSampling',
'YResolution', 'Megapixels', 'ImageHeight'} 'YResolution', 'Megapixels', 'ImageHeight', 'Orientation'}
class TiffParser(GdkPixbufAbstractParser): class TiffParser(GdkPixbufAbstractParser):
...@@ -146,13 +153,14 @@ class TiffParser(GdkPixbufAbstractParser): ...@@ -146,13 +153,14 @@ class TiffParser(GdkPixbufAbstractParser):
'FileInodeChangeDate', 'FileModifyDate', 'FileName', 'FileInodeChangeDate', 'FileModifyDate', 'FileName',
'FilePermissions', 'FileSize', 'FileType', 'FilePermissions', 'FileSize', 'FileType',
'FileTypeExtension', 'ImageHeight', 'ImageSize', 'FileTypeExtension', 'ImageHeight', 'ImageSize',
'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile'} 'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile', 'Orientation'}
class PPMParser(abstract.AbstractParser): class PPMParser(abstract.AbstractParser):
mimetypes = {'image/x-portable-pixmap'} mimetypes = {'image/x-portable-pixmap'}
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, Dict]]:
meta = {} # type: Dict[str, Union[str, Dict[Any, Any]]] meta: Dict[str, Union[str, Dict[Any, Any]]] = dict()
with open(self.filename) as f: with open(self.filename) as f:
for idx, line in enumerate(f): for idx, line in enumerate(f):
if line.lstrip().startswith('#'): if line.lstrip().startswith('#'):
...@@ -167,3 +175,36 @@ class PPMParser(abstract.AbstractParser): ...@@ -167,3 +175,36 @@ class PPMParser(abstract.AbstractParser):
line = re.sub(r"\s+", "", line, flags=re.UNICODE) line = re.sub(r"\s+", "", line, flags=re.UNICODE)
fout.write(line) fout.write(line)
return True return True
class HEICParser(exiftool.ExiftoolParser):
mimetypes = {'image/heic'}
meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
'FileSize', 'FileModifyDate', 'FileAccessDate',
'FileInodeChangeDate', 'FilePermissions', 'FileType',
'FileTypeExtension', 'MIMEType', 'MajorBrand', 'MinorVersion',
'CompatibleBrands','HandlerType', 'PrimaryItemReference',
'HEVCConfigurationVersion', 'GeneralProfileSpace',
'GeneralTierFlag', 'GeneralProfileIDC',
'GenProfileCompatibilityFlags', 'ConstraintIndicatorFlags',
'GeneralLevelIDC', 'MinSpatialSegmentationIDC',
'ParallelismType','ChromaFormat', 'BitDepthLuma', 'BitDepthChroma',
'NumTemporalLayers', 'TemporalIDNested', 'ImageWidth',
'ImageHeight', 'ImageSpatialExtent', 'ImagePixelDepth',
'AverageFrameRate', 'ConstantFrameRate', 'MediaDataSize',
'MediaDataOffset','ImageSize', 'Megapixels'}
def remove_all(self) -> bool:
return self._lightweight_cleanup()
class WEBPParser(GdkPixbufAbstractParser):
mimetypes = {'image/webp'}
meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName',
'Directory', 'FileSize', 'FileModifyDate',
'FileAccessDate', "FileInodeChangeDate",
'FilePermissions', 'FileType', 'FileTypeExtension',
'MIMEType', 'ImageWidth', 'ImageSize', 'BitsPerSample',
'ColorComponents', 'EncodingProcess', 'JFIFVersion',
'ResolutionUnit', 'XResolution', 'YCbCrSubSampling',
'YResolution', 'Megapixels', 'ImageHeight', 'Orientation',
'HorizontalScale', 'VerticalScale', 'VP8Version'}
This diff is collapsed.
import logging
import glob import glob
import os import os
import mimetypes import mimetypes
import importlib import importlib
from typing import TypeVar, List, Tuple, Optional from typing import TypeVar, Optional, List, Tuple
from . import abstract, UNSUPPORTED_EXTENSIONS from . import abstract, UNSUPPORTED_EXTENSIONS
...@@ -12,6 +11,10 @@ T = TypeVar('T', bound='abstract.AbstractParser') ...@@ -12,6 +11,10 @@ T = TypeVar('T', bound='abstract.AbstractParser')
mimetypes.add_type('application/epub+zip', '.epub') mimetypes.add_type('application/epub+zip', '.epub')
mimetypes.add_type('application/x-dtbncx+xml', '.ncx') # EPUB Navigation Control XML File mimetypes.add_type('application/x-dtbncx+xml', '.ncx') # EPUB Navigation Control XML File
# This should be removed after we move to python3.10
# https://github.com/python/cpython/commit/20a5b7e986377bdfd929d7e8c4e3db5847dfdb2d
mimetypes.add_type('image/heic', '.heic')
def __load_all_parsers(): def __load_all_parsers():
""" Loads every parser in a dynamic way """ """ Loads every parser in a dynamic way """
...@@ -40,7 +43,10 @@ def _get_parsers() -> List[T]: ...@@ -40,7 +43,10 @@ def _get_parsers() -> List[T]:
def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]: def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
""" Return the appropriate parser for a given filename. """ """ Return the appropriate parser for a given filename.
:raises ValueError: Raised if the instantiation of the parser went wrong.
"""
mtype, _ = mimetypes.guess_type(filename) mtype, _ = mimetypes.guess_type(filename)
_, extension = os.path.splitext(filename) _, extension = os.path.splitext(filename)
...@@ -53,10 +59,6 @@ def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]: ...@@ -53,10 +59,6 @@ def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
for parser_class in _get_parsers(): # type: ignore for parser_class in _get_parsers(): # type: ignore
if mtype in parser_class.mimetypes: if mtype in parser_class.mimetypes:
try: # This instantiation might raise a ValueError on malformed files
return parser_class(filename), mtype return parser_class(filename), mtype
except ValueError as e:
logging.info("Got an exception when trying to instantiate "
"%s for %s: %s", parser_class, filename, e)
return None, mtype
return None, mtype return None, mtype
...@@ -7,8 +7,7 @@ import re ...@@ -7,8 +7,7 @@ import re
import logging import logging
import tempfile import tempfile
import io import io
from typing import Dict, Union from typing import Union, Dict
from distutils.version import LooseVersion
import cairo import cairo
import gi import gi
...@@ -17,10 +16,7 @@ from gi.repository import Poppler, GLib ...@@ -17,10 +16,7 @@ from gi.repository import Poppler, GLib
from . import abstract from . import abstract
poppler_version = Poppler.get_version() FIXED_PDF_VERSION = cairo.PDFVersion.VERSION_1_5
if LooseVersion(poppler_version) < LooseVersion('0.46'): # pragma: no cover
raise ValueError("mat2 needs at least Poppler version 0.46 to work. \
The installed version is %s." % poppler_version) # pragma: no cover
class PDFParser(abstract.AbstractParser): class PDFParser(abstract.AbstractParser):
...@@ -32,7 +28,7 @@ class PDFParser(abstract.AbstractParser): ...@@ -32,7 +28,7 @@ class PDFParser(abstract.AbstractParser):
def __init__(self, filename): def __init__(self, filename):
super().__init__(filename) super().__init__(filename)
self.uri = 'file://' + os.path.abspath(self.filename) self.uri = 'file://' + os.path.abspath(self.filename)
self.__scale = 2 # how much precision do we want for the render self.__scale = 200 / 72.0 # how much precision do we want for the render
try: # Check now that the file is valid, to avoid surprises later try: # Check now that the file is valid, to avoid surprises later
Poppler.Document.new_from_file(self.uri, None) Poppler.Document.new_from_file(self.uri, None)
except GLib.GError: # Invalid PDF except GLib.GError: # Invalid PDF
...@@ -40,7 +36,10 @@ class PDFParser(abstract.AbstractParser): ...@@ -40,7 +36,10 @@ class PDFParser(abstract.AbstractParser):
def remove_all(self) -> bool: def remove_all(self) -> bool:
if self.lightweight_cleaning is True: if self.lightweight_cleaning is True:
try:
return self.__remove_all_lightweight() return self.__remove_all_lightweight()
except (cairo.Error, MemoryError) as e:
raise RuntimeError(e)
return self.__remove_all_thorough() return self.__remove_all_thorough()
def __remove_all_lightweight(self) -> bool: def __remove_all_lightweight(self) -> bool:
...@@ -52,6 +51,7 @@ class PDFParser(abstract.AbstractParser): ...@@ -52,6 +51,7 @@ class PDFParser(abstract.AbstractParser):
tmp_path = tempfile.mkstemp()[1] tmp_path = tempfile.mkstemp()[1]
pdf_surface = cairo.PDFSurface(tmp_path, 10, 10) # resized later anyway pdf_surface = cairo.PDFSurface(tmp_path, 10, 10) # resized later anyway
pdf_surface.restrict_to_version(FIXED_PDF_VERSION)
pdf_context = cairo.Context(pdf_surface) # context draws on the surface pdf_context = cairo.Context(pdf_surface) # context draws on the surface
for pagenum in range(pages_count): for pagenum in range(pages_count):
...@@ -80,15 +80,19 @@ class PDFParser(abstract.AbstractParser): ...@@ -80,15 +80,19 @@ class PDFParser(abstract.AbstractParser):
_, tmp_path = tempfile.mkstemp() _, tmp_path = tempfile.mkstemp()
pdf_surface = cairo.PDFSurface(tmp_path, 32, 32) # resized later anyway pdf_surface = cairo.PDFSurface(tmp_path, 32, 32) # resized later anyway
pdf_surface.restrict_to_version(FIXED_PDF_VERSION)
pdf_context = cairo.Context(pdf_surface) pdf_context = cairo.Context(pdf_surface)
for pagenum in range(pages_count): for pagenum in range(pages_count):
page = document.get_page(pagenum) page = document.get_page(pagenum)
if page is None: # pragma: no cover
logging.error("Unable to get PDF pages")
return False
page_width, page_height = page.get_size() page_width, page_height = page.get_size()
logging.info("Rendering page %d/%d", pagenum + 1, pages_count) logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
width = int(page_width) * self.__scale width = int(page_width * self.__scale)
height = int(page_height) * self.__scale height = int(page_height * self.__scale)
img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, height) img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, height)
img_context = cairo.Context(img_surface) img_context = cairo.Context(img_surface)
...@@ -102,7 +106,11 @@ class PDFParser(abstract.AbstractParser): ...@@ -102,7 +106,11 @@ class PDFParser(abstract.AbstractParser):
buf.seek(0) buf.seek(0)
img = cairo.ImageSurface.create_from_png(buf) img = cairo.ImageSurface.create_from_png(buf)
pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale) if cairo.version_info < (1, 12, 0):
pdf_surface.set_size(width, height)
else:
pdf_surface.set_size(page_width, page_height)
pdf_surface.set_device_scale(1 / self.__scale, 1 / self.__scale)
pdf_context.set_source_surface(img, 0, 0) pdf_context.set_source_surface(img, 0, 0)
pdf_context.paint() pdf_context.paint()
pdf_context.show_page() # draw pdf_context on pdf_surface pdf_context.show_page() # draw pdf_context on pdf_surface
...@@ -122,6 +130,17 @@ class PDFParser(abstract.AbstractParser): ...@@ -122,6 +130,17 @@ class PDFParser(abstract.AbstractParser):
document.set_creator('') document.set_creator('')
document.set_creation_date(-1) document.set_creation_date(-1)
document.save('file://' + os.path.abspath(out_file)) document.save('file://' + os.path.abspath(out_file))
# Cairo adds "/Producer" and "/CreationDate", and Poppler sometimes
# fails to remove them, we have to use this terrible regex.
# It should(tm) be alright though, because cairo's output format
# for metadata is fixed.
with open(out_file, 'rb') as f:
out = re.sub(rb'<<[\s\n]*/Producer.*?>>', b' << >>', f.read(),
count=0, flags=re.DOTALL | re.IGNORECASE)
with open(out_file, 'wb') as f:
f.write(out)
return True return True
@staticmethod @staticmethod
...@@ -131,7 +150,7 @@ class PDFParser(abstract.AbstractParser): ...@@ -131,7 +150,7 @@ class PDFParser(abstract.AbstractParser):
metadata[key] = value metadata[key] = value
return metadata return metadata
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, Dict]]:
""" Return a dict with all the meta of the file """ Return a dict with all the meta of the file
""" """
metadata = {} metadata = {}
......