Compare revisions
Commits on Source (135)
variables:
CONTAINER_REGISTRY: $CI_REGISTRY/georg/mat2-ci-images
GIT_DEPTH: "5"
GIT_STRATEGY: clone
stages:
- linting
......@@ -9,43 +11,23 @@ stages:
before_script: # This is needed to not run the testsuite as root
- useradd --home-dir ${CI_PROJECT_DIR} mat2
- chown -R mat2 .
linting:bandit:
image: $CONTAINER_REGISTRY:linting
stage: linting
script: # TODO: remove B405 and B314
- bandit ./mat2 --format txt --skip B101
- bandit -r ./nautilus/ --format txt --skip B101
- bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314,B108,B311
linting:codespell:
image: $CONTAINER_REGISTRY:linting
stage: linting
script:
# Run codespell to check for spelling errors; ignore errors about binary
# files, use a config with ignored words and exclude the git directory,
# which might contain false positives
- codespell -q 2 -I utils/ci/codespell/ignored_words.txt -S .git
linting:pylint:
linting:ruff:
image: $CONTAINER_REGISTRY:linting
stage: linting
script:
- pylint --disable=no-else-return,no-else-raise,no-else-continue,unnecessary-comprehension --extension-pkg-whitelist=cairo,gi ./libmat2 ./mat2
# Once nautilus-python is in Debian, uncomment the line below
- pylint --disable=no-else-return,no-else-raise,no-else-continue,unnecessary-comprehension --extension-pkg-whitelist=Nautilus,GObject,Gtk,Gio,GLib,gi ./nautilus/mat2.py
linting:pyflakes:
image: $CONTAINER_REGISTRY:linting
stage: linting
script:
- pyflakes3 ./libmat2 ./mat2 ./tests/ ./nautilus
- apt update
- apt install -qqy --no-install-recommends python3-venv
- python3 -m venv venv
- source venv/bin/activate
- pip3 install ruff
- ruff check .
linting:mypy:
image: $CONTAINER_REGISTRY:linting
stage: linting
script:
- mypy --ignore-missing-imports mat2 libmat2/*.py ./nautilus/mat2.py
- mypy --ignore-missing-imports mat2 libmat2/*.py
tests:archlinux:
image: $CONTAINER_REGISTRY:archlinux
......@@ -56,17 +38,20 @@ tests:archlinux:
tests:debian:
image: $CONTAINER_REGISTRY:debian
stage: test
<<: *prepare_env
script:
- apt-get -qqy purge bubblewrap
- python3 -m unittest discover -v
- su - mat2 -c "python3-coverage run --branch -m unittest discover -s tests/"
- su - mat2 -c "python3-coverage report --fail-under=95 -m --include 'libmat2/*'"
tests:debian_with_bubblewrap:
image: $CONTAINER_REGISTRY:debian
stage: test
allow_failure: true
<<: *prepare_env
script:
- su - mat2 -c "python3-coverage run --branch -m unittest discover -s tests/"
- su - mat2 -c "python3-coverage report --fail-under=100 -m --include 'libmat2/*'"
- apt-get -qqy install bubblewrap
- python3 -m unittest discover -v
tests:fedora:
image: $CONTAINER_REGISTRY:fedora
......@@ -80,3 +65,51 @@ tests:gentoo:
<<: *prepare_env
script:
- su - mat2 -c "python3 -m unittest discover -v"
tests:python3.7:
image: $CONTAINER_REGISTRY:python3.7
stage: test
script:
- python3 -m unittest discover -v
tests:python3.8:
image: $CONTAINER_REGISTRY:python3.8
stage: test
script:
- python3 -m unittest discover -v
tests:python3.9:
image: $CONTAINER_REGISTRY:python3.9
stage: test
script:
- python3 -m unittest discover -v
tests:python3.10:
image: $CONTAINER_REGISTRY:python3.10
stage: test
script:
- python3 -m unittest discover -v
tests:python3.11:
image: $CONTAINER_REGISTRY:python3.11
stage: test
script:
- python3 -m unittest discover -v
tests:python3.12:
image: $CONTAINER_REGISTRY:python3.12
stage: test
script:
- python3 -m unittest discover -v
tests:python3.13:
image: $CONTAINER_REGISTRY:python3.13
stage: test
script:
- python3 -m unittest discover -v
tests:python3.14:
image: $CONTAINER_REGISTRY:python3.14
stage: test
script:
- python3 -m unittest discover -v
......@@ -14,4 +14,5 @@ disable=
catching-non-exception,
cell-var-from-loop,
locally-disabled,
raise-missing-from,
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
# 0.13.5 - 2025-01-09
- Keep orientation metadata on jpeg and tiff files
- Improve cairo-related error/exceptions handling
- Improve the logging
- Improve the sandboxing
- Improve Python3.12 support
- Improve MSOffice documents handling
# 0.13.4 - 2023-08-02
- Add documentation about mat2 on OSX
- Make use of python3.7 constructs to simplify code
- Use more modern type annotations
- Harden get_meta in archive.py against variants of CVE-2022-35410
- Improve MSOffice document support
- Package the manpage on pypi
# 0.13.3 - 2023-02-23
- Fix a decorator argument
# 0.13.2 - 2023-01-28
- Fix a crash on some python versions
# 0.13.1 - 2023-01-07
- Improve xlsx support
- Remove the Nautilus extension
# 0.13.0 - 2022-07-06
- Fix an arbitrary file read (CVE-2022-35410)
- Add support for heic files
# 0.12.4 - 2022-04-30
- Fix possible errors/crashes when processing multiple files
via the command line interface
- Use a fixed PDF version for the output
- Improve compatibility with modern versions of rsvg
- Improve the robustness of the command line interface with
regard to control characters
# 0.12.3 - 2022-01-06
- Implement code for internationalization
- Keep individual files compression type in zip files
- Increase the robustness of mat2 against weird/corrupted files
- Fix the dolphin integration
- Add a fuzzer
# 0.12.2 - 2021-08-29
- Add support for aiff files
- Improve MS Office support
- Improve compatibility with newer/older versions of mat2's dependencies
- Fix possible issues with the resolution of processed PDFs
# 0.12.1 - 2021-03-19
- Improve epub support
- Improve MS Office support
# 0.12.0 - 2020-12-18
- Significantly improve MS Office formats support
- Fix some typos in the Nautilus extension
- Improve reliability of the mp3, pdf and svg parsers
- Improve compatibility with ffmpeg when sandboxing is used
- Improve the dolphin extension usability
- libmat2 now raises a ValueError on malformed files while trying to
find the right parser, instead of returning None
# 0.11.0 - 2020-03-29
- Significantly improve MS Office formats support
......
......@@ -4,8 +4,14 @@ The main repository for mat2 is on [0xacab]( https://0xacab.org/jvoisin/mat2 ),
but you can send patches to jvoisin by [email](https://dustri.org/) if you prefer.
Do feel free to pick up [an issue]( https://0xacab.org/jvoisin/mat2/issues )
and to send a pull-request. Please do check that everything is fine by running the
testsuite with `python3 -m unittest discover -v` before submitting one :)
and to send a pull-request.
Before sending the pull-request, please do check that everything is fine by
running the full test suite in GitLab. To do that, after forking mat2 in GitLab,
you need to go to Settings -> CI/CD -> Runners and enable shared runners there.
mat2 also has unit tests (which are also run as part of the full test suite).
You can run them with `python3 -m unittest discover -v`.
If you're fixing a bug or adding a new feature, please add tests accordingly,
as in the sketch below; this will greatly improve the odds of your
merge-request getting merged.
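As a rough illustration, here is a minimal sketch of what such a test could
look like; the file name, test data path and class name are purely
illustrative, not existing parts of mat2's test suite:

```
# tests/test_example.py -- an illustrative sketch, not an actual mat2 test file
import os
import unittest

from libmat2 import parser_factory


class TestPNGCleaning(unittest.TestCase):
    def test_remove_all(self):
        # get_parser() returns a (parser, mimetype) tuple; the parser
        # is None when the format isn't supported.
        parser, mimetype = parser_factory.get_parser('./tests/data/dirty.png')
        self.assertEqual(mimetype, 'image/png')
        self.assertTrue(parser.remove_all())
        # mat2 never cleans in-place: it writes a *.cleaned.* copy instead
        self.assertTrue(os.path.exists('./tests/data/dirty.cleaned.png'))
```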
......@@ -24,15 +30,16 @@ Since mat2 is written in Python3, please conform as much as possible to the
1. Update the [changelog](https://0xacab.org/jvoisin/mat2/blob/master/CHANGELOG.md)
2. Update the version in the [mat2](https://0xacab.org/jvoisin/mat2/blob/master/mat2) file
3. Update the version in the [setup.py](https://0xacab.org/jvoisin/mat2/blob/master/setup.py) file
4. Update the version and date in the [man page](https://0xacab.org/jvoisin/mat2/blob/master/doc/mat2.1)
5. Commit the changelog, man page, mat2 and setup.py files
6. Create a tag with `git tag -s $VERSION`
7. Push the commit with `git push origin master`
8. Push the tag with `git push --tags`
9. Download the gitlab archive of the release
10. Diff it against the local copy
11. If there is no difference, sign the archive with `gpg --armor --detach-sign mat2-$VERSION.tar.xz`
12. Upload the signature on Gitlab's [tag page](https://0xacab.org/jvoisin/mat2/tags) and add the changelog there
13. Announce the release on the [mailing list](https://mailman.boum.org/listinfo/mat-dev)
14. Sign'n'upload the new version on pypi with `python3 setup.py sdist bdist_wheel` then `twine upload -s dist/*`
15. Do the secret release dance
4. Update the version in the [pyproject.toml](https://0xacab.org/jvoisin/mat2/blob/master/pyproject.toml) file
5. Update the version and date in the [man page](https://0xacab.org/jvoisin/mat2/blob/master/doc/mat2.1)
6. Commit the modified files
7. Create a tag with `git tag -s $VERSION`
8. Push the commit with `git push origin master`
9. Push the tag with `git push --tags`
10. Download the gitlab archive of the release
11. Diff it against the local copy
12. If there is no difference, sign the archive with `gpg --armor --detach-sign mat2-$VERSION.tar.xz`
13. Upload the signature on Gitlab's [tag page](https://0xacab.org/jvoisin/mat2/tags) and add the changelog there
14. Announce the release on the [mailing list](https://mailman.boum.org/listinfo/mat-dev)
15. Sign'n'upload the new version on pypi with `python3 setup.py sdist bdist_wheel` then `twine upload -s dist/*`
16. Do the secret release dance
......@@ -18,34 +18,53 @@ installed, mat2 uses it to sandbox any external processes it invokes.
## Arch Linux
Thanks to [Francois_B](https://www.sciunto.org/), there is a package available on
[Arch Linux's AUR](https://aur.archlinux.org/packages/mat2/).
Thanks to [kpcyrd](https://archlinux.org/packages/?maintainer=kpcyrd), there is a package available in
[Arch Linux's official repositories](https://archlinux.org/packages/extra/any/mat2/).
## Debian
There is a package available in [Debian](https://packages.debian.org/search?keywords=mat2&searchon=names&section=all).
There is a package available in [Debian](https://packages.debian.org/search?keywords=mat2&searchon=names&section=all) and you can install mat2 with:
```
apt install mat2
```
## Fedora
Thanks to [atenart](https://ack.tf/), there is a package available on
[Fedora's copr]( https://copr.fedorainfracloud.org/coprs/atenart/mat2/ ).
We use copr (cool other packages repo) as the Mat2 Nautilus plugin depends on
python3-nautilus, which isn't available yet in Fedora (but is distributed
through this copr).
First you need to enable Mat2's copr:
First you need to enable mat2's copr:
```
dnf -y copr enable atenart/mat2
```
Then you can install both the Mat2 command and Nautilus extension:
Then you can install mat2:
```
dnf -y install mat2 mat2-nautilus
dnf -y install mat2
```
## Gentoo
mat2 is available in the [torbrowser overlay](https://github.com/MeisterP/torbrowser-overlay).
# OSX
## Homebrew
mat2 is [available on homebrew](https://formulae.brew.sh/formula/mat2):
```
brew install mat2
```
## MacPorts
mat2 is [available on MacPorts](https://ports.macports.org/port/mat2/):
```
port install mat2
```
```
 _____ _____ _____ ___
|     |  _  |_   _|_  |  Keep your data,
| | | |     | | | |  _|     trash your meta!
|_|_|_|__|__| |_| |___|
```
This software is currently in **beta**; please don't use it for anything
critical.
# Metadata and privacy
Metadata consist of information that characterizes data.
......@@ -25,9 +22,14 @@ Maybe you don't want to disclose those information.
This is precisely the job of mat2: getting rid, as much as possible, of
metadata.
mat2 provides a command line tool, and graphical user interfaces via a service
menu for Dolphin, the default file manager of KDE, and an extension for
Nautilus, the default file manager of GNOME.
mat2 provides:
- a library called `libmat2`;
- a command line tool called `mat2`;
- a service menu for Dolphin, KDE's default file manager.
If you prefer a regular graphical user interface, you might be interested in
[Metadata Cleaner](https://metadatacleaner.romainvigier.fr/), which uses
`mat2` under the hood.
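For the library, here is a minimal usage sketch; `myfile.png` is just an
example input, and error handling is kept deliberately short:

```
from libmat2 import parser_factory

# get_parser() may raise a ValueError on malformed files, and returns
# (None, mimetype) for unsupported formats.
parser, mimetype = parser_factory.get_parser('myfile.png')
if parser is None:
    raise ValueError('unsupported format: %s' % mimetype)

print(parser.get_meta())  # dict of the metadata found in the file
parser.remove_all()       # writes a cleaned copy, here myfile.cleaned.png
```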
# Requirements
......@@ -93,6 +95,19 @@ Note that mat2 **will not** clean files in-place: with a file named
"myfile.png", for example, it will produce a cleaned version named
"myfile.cleaned.png".
## Web interface
It's possible to run mat2 as a web service, via
[mat2-web](https://0xacab.org/jvoisin/mat2-web).
If you're using WordPress, you might be interested in [wp-mat](https://git.autistici.org/noblogs/wp-mat)
and [wp-mat-server](https://git.autistici.org/noblogs/wp-mat-server).
## Desktop GUI
For GNU/Linux desktops, it's possible to use the
[Metadata Cleaner](https://gitlab.com/rmnvgr/metadata-cleaner) GTK application.
# Supported formats
The following formats are supported: avi, bmp, css, epub/ncx, flac, gif, jpeg,
......@@ -129,11 +144,13 @@ of the guarantee that mat2 won't modify the data of their files, there is the
watermarks from PDF.
- [Scrambled Exif](https://f-droid.org/packages/com.jarsilio.android.scrambledeggsif/),
an open-source Android application to remove metadata from pictures.
- [Dangerzone](https://dangerzone.rocks/), designed to sanitize harmful documents
into harmless ones.
# Contact
If possible, use the [issues system](https://0xacab.org/jvoisin/mat2/issues)
or the [mailing list](https://mailman.boum.org/listinfo/mat-dev)
or the [mailing list](https://www.autistici.org/mailman/listinfo/mat-dev)
Should a more private contact be needed (e.g. for reporting security issues),
you can email Julien (jvoisin) Voisin at `julien.voisin+mat2@dustri.org`,
using the gpg key `9FCDEE9E1A381F311EA62A7404D041E8171901CC`.
......@@ -174,4 +191,3 @@ mat2 wouldn't exist without:
- friends
Many thanks to them!
.TH mat2 "1" "March 2020" "mat2 0.11.0" "User Commands"
.TH mat2 "1" "January 2025" "mat2 0.13.5" "User Commands"
.SH NAME
mat2 \- the metadata anonymisation toolkit 2
......
[Desktop Entry]
X-KDE-ServiceTypes=KonqPopupMenu/Plugin
MimeType=application/pdf;application/vnd.oasis.opendocument.chart ;application/vnd.oasis.opendocument.formula ;application/vnd.oasis.opendocument.graphics ;application/vnd.oasis.opendocument.image ;application/vnd.oasis.opendocument.presentation ;application/vnd.oasis.opendocument.spreadsheet ;application/vnd.oasis.opendocument.text ;application/vnd.openxmlformats-officedocument.presentationml.presentation ;application/vnd.openxmlformats-officedocument.spreadsheetml.sheet ;application/vnd.openxmlformats-officedocument.wordprocessingml.document ;application/x-bittorrent ;application/zip ;audio/flac ;audio/mpeg ;audio/ogg ;audio/x-flac ;image/jpeg ;image/png ;image/tiff ;image/x-ms-bmp ;text/plain ;video/mp4 ;video/x-msvideo;
MimeType=application/pdf;application/vnd.oasis.opendocument.chart;application/vnd.oasis.opendocument.formula;application/vnd.oasis.opendocument.graphics;application/vnd.oasis.opendocument.image;application/vnd.oasis.opendocument.presentation;application/vnd.oasis.opendocument.spreadsheet;application/vnd.oasis.opendocument.text;application/vnd.openxmlformats-officedocument.presentationml.presentation;application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;application/vnd.openxmlformats-officedocument.wordprocessingml.document;application/x-bittorrent;application/zip;audio/flac;audio/mpeg;audio/ogg;audio/x-flac;image/jpeg;image/png;image/tiff;image/x-ms-bmp;text/plain;video/mp4;video/x-msvideo;
Actions=cleanMetadata;
Type=Service
[Desktop Action cleanMetadata]
Name=Clean metadata
Name[de]=Metadaten löschen
Name[es]=Limpiar metadatos
Icon=/usr/share/icons/hicolor/scalable/apps/mat2.svg
Exec=kdialog --yesno "$( mat2 -s %U )" --title "Clean Metadata?" && mat2 %U
Exec=kdialog --yesno "$( mat2 -s %F )" --title "Clean Metadata?" && mat2 %U
Exec[de]=kdialog --yesno "$( mat2 -s %F )" --title "Metadaten löschen?" && mat2 %U
......@@ -2,15 +2,10 @@
import enum
import importlib
from typing import Dict, Optional, Union
from typing import Dict
from . import exiftool, video
# make pyflakes happy
assert Dict
assert Optional
assert Union
# A set of extension that aren't supported, despite matching a supported mimetype
UNSUPPORTED_EXTENSIONS = {
'.asc',
......@@ -67,8 +62,9 @@ CMD_DEPENDENCIES = {
},
}
def check_dependencies() -> Dict[str, Dict[str, bool]]:
ret = dict() # type: Dict[str, dict]
ret: Dict[str, Dict] = dict()
for key, value in DEPENDENCIES.items():
ret[key] = {
......
import abc
import os
import re
from typing import Set, Dict, Union
assert Set # make pyflakes happy
from typing import Union, Set, Dict
class AbstractParser(abc.ABC):
......@@ -11,8 +9,8 @@ class AbstractParser(abc.ABC):
It might raise `ValueError` on instantiation with invalid files,
and `RuntimeError` when something goes wrong in `remove_all`.
"""
meta_list = set() # type: Set[str]
mimetypes = set() # type: Set[str]
meta_list: Set[str] = set()
mimetypes: Set[str] = set()
def __init__(self, filename: str) -> None:
"""
......@@ -35,8 +33,11 @@ class AbstractParser(abc.ABC):
self.sandbox = True
@abc.abstractmethod
def get_meta(self) -> Dict[str, Union[str, dict]]:
"""Return all the metadata of the current file"""
def get_meta(self) -> Dict[str, Union[str, Dict]]:
"""Return all the metadata of the current file
:raises RuntimeError: Raised if the cleaning process went wrong.
"""
@abc.abstractmethod
def remove_all(self) -> bool:
......
......@@ -7,14 +7,10 @@ import tempfile
import os
import logging
import shutil
from typing import Dict, Set, Pattern, Union, Any, List
from typing import Pattern, Union, Any, Set, Dict, List
from . import abstract, UnknownMemberPolicy, parser_factory
# Make pyflakes happy
assert Set
assert Pattern
# pylint: disable=not-callable,assignment-from-no-return,too-many-branches
# An ArchiveClass is a class representing an archive,
......@@ -48,20 +44,20 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
def __init__(self, filename):
super().__init__(filename)
# We ignore typing here because mypy is too stupid
self.archive_class = None # type: ignore
self.member_class = None # type: ignore
self.archive_class = None # type: ignore
self.member_class = None # type: ignore
# Those are the files that have a format that _isn't_
# supported by mat2, but that we want to keep anyway.
self.files_to_keep = set() # type: Set[Pattern]
self.files_to_keep: Set[Pattern] = set()
# Those are the files that we _do not_ want to keep,
# no matter if they are supported or not.
self.files_to_omit = set() # type: Set[Pattern]
self.files_to_omit: Set[Pattern] = set()
# what should the parser do if it encounters an unknown file in
# the archive?
self.unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy
self.unknown_member_policy: UnknownMemberPolicy = UnknownMemberPolicy.ABORT
# The LGTM comment is to mask a false-positive,
# see https://lgtm.com/projects/g/jvoisin/mat2/
......@@ -73,20 +69,20 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
def _specific_cleanup(self, full_path: str) -> bool:
""" This method can be used to apply specific treatment
to files present in the archive."""
# pylint: disable=unused-argument,no-self-use
# pylint: disable=unused-argument
return True # pragma: no cover
def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
""" This method can be used to extract specific metadata
from files present in the archive."""
# pylint: disable=unused-argument,no-self-use
# pylint: disable=unused-argument
return {} # pragma: no cover
def _final_checks(self) -> bool:
""" This method is invoked after the file has been cleaned,
allowing to run final verifications.
"""
# pylint: disable=unused-argument,no-self-use
# pylint: disable=unused-argument
return True
@staticmethod
......@@ -109,6 +105,11 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
def _get_member_name(member: ArchiveMember) -> str:
"""Return the name of the given member."""
@staticmethod
@abc.abstractmethod
def _is_dir(member: ArchiveMember) -> bool:
"""Return true is the given member is a directory."""
@abc.abstractmethod
def _add_file_to_archive(self, archive: ArchiveClass, member: ArchiveMember,
full_path: str):
......@@ -120,8 +121,20 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# pylint: disable=unused-argument
return member
def get_meta(self) -> Dict[str, Union[str, dict]]:
meta = dict() # type: Dict[str, Union[str, dict]]
@staticmethod
def _get_member_compression(member: ArchiveMember):
"""Get the compression of the archive member."""
# pylint: disable=unused-argument
return None
@staticmethod
def _set_member_compression(member: ArchiveMember, compression) -> ArchiveMember:
"""Set the compression of the archive member."""
# pylint: disable=unused-argument
return member
def get_meta(self) -> Dict[str, Union[str, Dict]]:
meta: Dict[str, Union[str, Dict]] = dict()
with self.archive_class(self.filename) as zin:
temp_folder = tempfile.mkdtemp()
......@@ -130,12 +143,17 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
local_meta = self._get_member_meta(item)
member_name = self._get_member_name(item)
if member_name[-1] == '/': # pragma: no cover
# `is_dir` is added in Python3.6
if self._is_dir(item): # pragma: no cover
continue # don't keep empty folders
zin.extract(member=item, path=temp_folder)
full_path = os.path.join(temp_folder, member_name)
if not os.path.abspath(full_path).startswith(temp_folder):
logging.error("%s contains a file (%s) pointing outside (%s) of its root.",
self.filename, member_name, full_path)
break
zin.extract(member=item, path=temp_folder)
os.chmod(full_path, stat.S_IRUSR)
specific_meta = self._specific_get_meta(full_path, member_name)
......@@ -143,6 +161,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
member_parser, _ = parser_factory.get_parser(full_path) # type: ignore
if member_parser:
member_parser.sandbox = self.sandbox
local_meta = {**local_meta, **member_parser.get_meta()}
if local_meta:
......@@ -162,12 +181,12 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# Sort the items to process, to reduce fingerprinting,
# and keep them in the `items` variable.
items = list() # type: List[ArchiveMember]
items: List[ArchiveMember] = list()
for item in sorted(self._get_all_members(zin), key=self._get_member_name):
# Some file formats require the `mimetype` file
# to be the first file in the archive.
if self._get_member_name(item) == 'mimetype':
items = [item] + items
items.insert(0, item)
else:
items.append(item)
......@@ -175,18 +194,36 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# we're iterating (and thus inserting) them in lexicographic order.
for item in items:
member_name = self._get_member_name(item)
if member_name[-1] == '/': # `is_dir` is added in Python3.6
if self._is_dir(item):
continue # don't keep empty folders
zin.extract(member=item, path=temp_folder)
full_path = os.path.join(temp_folder, member_name)
if not os.path.abspath(full_path).startswith(temp_folder):
logging.error("%s contains a file (%s) pointing outside (%s) of its root.",
self.filename, member_name, full_path)
abort = True
break
zin.extract(member=item, path=temp_folder)
try:
original_permissions = os.stat(full_path).st_mode
except FileNotFoundError:
logging.error("Something went wrong during processing of "
"%s in %s, likely a path traversal attack.",
member_name, self.filename)
abort = True
# we're breaking instead of continuing, because this exception
# is raised in case of weird path-traversal-like attacks.
break
original_permissions = os.stat(full_path).st_mode
os.chmod(full_path, original_permissions | stat.S_IWUSR | stat.S_IRUSR)
original_compression = self._get_member_compression(item)
if self._specific_cleanup(full_path) is False:
logging.warning("Something went wrong during deep cleaning of %s",
member_name)
logging.warning("Something went wrong during deep cleaning of %s in %s",
member_name, self.filename)
abort = True
continue
......@@ -212,6 +249,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
abort = True
continue
else:
member_parser.sandbox = self.sandbox
if member_parser.remove_all() is False:
logging.warning("In file %s, something went wrong \
with the cleaning of %s \
......@@ -223,6 +261,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
zinfo = self.member_class(member_name) # type: ignore
zinfo = self._set_member_permissions(zinfo, original_permissions)
zinfo = self._set_member_compression(zinfo, original_compression)
clean_zinfo = self._clean_member(zinfo)
self._add_file_to_archive(zout, clean_zinfo, full_path)
......@@ -237,6 +276,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
class TarParser(ArchiveBasedAbstractParser):
mimetypes = {'application/x-tar'}
def __init__(self, filename):
super().__init__(filename)
# yes, it's tarfile.open and not tarfile.TarFile,
......@@ -346,6 +386,11 @@ class TarParser(ArchiveBasedAbstractParser):
member.mode = permissions
return member
@staticmethod
def _is_dir(member: ArchiveMember) -> bool:
assert isinstance(member, tarfile.TarInfo) # please mypy
return member.isdir()
class TarGzParser(TarParser):
compression = ':gz'
......@@ -364,16 +409,17 @@ class TarXzParser(TarParser):
class ZipParser(ArchiveBasedAbstractParser):
mimetypes = {'application/zip'}
def __init__(self, filename):
def __init__(self, filename: str):
super().__init__(filename)
self.archive_class = zipfile.ZipFile
self.member_class = zipfile.ZipInfo
self.zip_compression_type = zipfile.ZIP_DEFLATED
def is_archive_valid(self):
try:
zipfile.ZipFile(self.filename)
except zipfile.BadZipFile:
with zipfile.ZipFile(self.filename):
pass
except (zipfile.BadZipFile, OSError):
raise ValueError
@staticmethod
......@@ -409,7 +455,7 @@ class ZipParser(ArchiveBasedAbstractParser):
assert isinstance(member, zipfile.ZipInfo) # please mypy
with open(full_path, 'rb') as f:
archive.writestr(member, f.read(),
compress_type=self.zip_compression_type)
compress_type=member.compress_type)
@staticmethod
def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
......@@ -420,3 +466,19 @@ class ZipParser(ArchiveBasedAbstractParser):
def _get_member_name(member: ArchiveMember) -> str:
assert isinstance(member, zipfile.ZipInfo) # please mypy
return member.filename
@staticmethod
def _get_member_compression(member: ArchiveMember):
assert isinstance(member, zipfile.ZipInfo) # please mypy
return member.compress_type
@staticmethod
def _set_member_compression(member: ArchiveMember, compression) -> ArchiveMember:
assert isinstance(member, zipfile.ZipInfo) # please mypy
member.compress_type = compression
return member
@staticmethod
def _is_dir(member: ArchiveMember) -> bool:
assert isinstance(member, zipfile.ZipInfo) # please mypy
return member.is_dir()
......@@ -2,7 +2,7 @@ import mimetypes
import os
import shutil
import tempfile
from typing import Dict, Union
from typing import Union, Dict
import mutagen
......@@ -13,33 +13,40 @@ class MutagenParser(abstract.AbstractParser):
def __init__(self, filename):
super().__init__(filename)
try:
mutagen.File(self.filename)
if mutagen.File(self.filename) is None:
raise ValueError
except mutagen.MutagenError:
raise ValueError
def get_meta(self) -> Dict[str, Union[str, dict]]:
def get_meta(self) -> Dict[str, Union[str, Dict]]:
f = mutagen.File(self.filename)
if f.tags:
return {k:', '.join(v) for k, v in f.tags.items()}
return {k: ', '.join(map(str, v)) for k, v in f.tags.items()}
return {}
def remove_all(self) -> bool:
shutil.copy(self.filename, self.output_filename)
f = mutagen.File(self.output_filename)
f.delete()
f.save()
try:
f.delete()
f.save()
except mutagen.MutagenError:
raise ValueError
return True
class MP3Parser(MutagenParser):
mimetypes = {'audio/mpeg', }
def get_meta(self) -> Dict[str, Union[str, dict]]:
metadata = {} # type: Dict[str, Union[str, dict]]
def get_meta(self) -> Dict[str, Union[str, Dict]]:
metadata: Dict[str, Union[str, Dict]] = dict()
meta = mutagen.File(self.filename).tags
if not meta:
return metadata
for key in meta:
if isinstance(key, tuple):
metadata[key[0]] = key[1]
continue
if not hasattr(meta[key], 'text'): # pragma: no cover
continue
metadata[key.rstrip(' \t\r\n\0')] = ', '.join(map(str, meta[key].text))
......@@ -61,12 +68,12 @@ class FLACParser(MutagenParser):
f.save(deleteid3=True)
return True
def get_meta(self) -> Dict[str, Union[str, dict]]:
def get_meta(self) -> Dict[str, Union[str, Dict]]:
meta = super().get_meta()
for num, picture in enumerate(mutagen.File(self.filename).pictures):
name = picture.desc if picture.desc else 'Cover %d' % num
extension = mimetypes.guess_extension(picture.mime)
if extension is None: # pragma: no cover
if extension is None: # pragma: no cover
meta[name] = 'harmful data'
continue
......@@ -75,6 +82,9 @@ class FLACParser(MutagenParser):
with open(fname, 'wb') as f:
f.write(picture.data)
p, _ = parser_factory.get_parser(fname) # type: ignore
if p is None:
raise ValueError
p.sandbox = self.sandbox
# Mypy chokes on ternaries :/
meta[name] = p.get_meta() if p else 'harmful data' # type: ignore
os.remove(fname)
......@@ -90,3 +100,15 @@ class WAVParser(video.AbstractFFmpegParser):
'FileSize', 'FileType', 'FileTypeExtension',
'MIMEType', 'NumChannels', 'SampleRate', 'SourceFile',
}
class AIFFParser(video.AbstractFFmpegParser):
mimetypes = {'audio/aiff', 'audio/x-aiff'}
meta_allowlist = {'AvgBytesPerSec', 'BitsPerSample', 'Directory',
'Duration', 'Encoding', 'ExifToolVersion',
'FileAccessDate', 'FileInodeChangeDate',
'FileModifyDate', 'FileName', 'FilePermissions',
'FileSize', 'FileType', 'FileTypeExtension',
'MIMEType', 'NumChannels', 'SampleRate', 'SourceFile',
'NumSampleFrames', 'SampleSize',
}
......@@ -11,7 +11,8 @@ import os
import shutil
import subprocess
import tempfile
from typing import List, Optional
import functools
from typing import Optional, List
__all__ = ['PIPE', 'run', 'CalledProcessError']
......@@ -21,6 +22,7 @@ CalledProcessError = subprocess.CalledProcessError
# pylint: disable=subprocess-run-check
@functools.lru_cache(maxsize=None)
def _get_bwrap_path() -> str:
which_path = shutil.which('bwrap')
if which_path:
......@@ -29,7 +31,6 @@ def _get_bwrap_path() -> str:
raise RuntimeError("Unable to find bwrap") # pragma: no cover
# pylint: disable=bad-whitespace
def _get_bwrap_args(tempdir: str,
input_filename: str,
output_filename: Optional[str] = None) -> List[str]:
......@@ -38,7 +39,7 @@ def _get_bwrap_args(tempdir: str,
# XXX: use --ro-bind-try once all supported platforms
# have a bubblewrap recent enough to support it.
ro_bind_dirs = ['/usr', '/lib', '/lib64', '/bin', '/sbin', cwd]
ro_bind_dirs = ['/usr', '/lib', '/lib64', '/bin', '/sbin', '/etc/alternatives', cwd]
for bind_dir in ro_bind_dirs:
if os.path.isdir(bind_dir): # pragma: no cover
ro_bind_args.extend(['--ro-bind', bind_dir, bind_dir])
......@@ -77,7 +78,6 @@ def _get_bwrap_args(tempdir: str,
return args
# pylint: disable=bad-whitespace
def run(args: List[str],
input_filename: str,
output_filename: Optional[str] = None,
......
import logging
import re
import uuid
import zipfile
import xml.etree.ElementTree as ET # type: ignore
from typing import Any, Dict
from . import archive, office
class EPUBParser(archive.ZipParser):
mimetypes = {'application/epub+zip', }
metadata_namespace = '{http://purl.org/dc/elements/1.1/}'
......@@ -15,11 +18,27 @@ class EPUBParser(archive.ZipParser):
'META-INF/container.xml',
'mimetype',
'OEBPS/content.opf',
'content.opf',
'hmh.opf',
'OPS/.+.xml'
}))
self.files_to_omit = set(map(re.compile, { # type: ignore
'iTunesMetadata.plist',
'META-INF/calibre_bookmarks.txt',
'OEBPS/package.opf',
}))
self.uniqid = uuid.uuid4()
def _specific_get_meta(self, full_path, file_path):
if file_path != 'OEBPS/content.opf':
def is_archive_valid(self):
super().is_archive_valid()
with zipfile.ZipFile(self.filename) as zin:
for item in self._get_all_members(zin):
member_name = self._get_member_name(item)
if member_name.endswith('META-INF/encryption.xml'):
raise ValueError('the file contains encrypted fonts')
def _specific_get_meta(self, full_path, file_path) -> Dict[str, Any]:
if not file_path.endswith('.opf'):
return {}
with open(full_path, encoding='utf-8') as f:
......@@ -30,14 +49,31 @@ class EPUBParser(archive.ZipParser):
except (TypeError, UnicodeDecodeError):
return {file_path: 'harmful content', }
def _specific_cleanup(self, full_path: str):
if full_path.endswith('OEBPS/content.opf'):
def _specific_cleanup(self, full_path: str) -> bool:
if full_path.endswith('hmh.opf') or full_path.endswith('content.opf'):
return self.__handle_contentopf(full_path)
elif full_path.endswith('OEBPS/toc.ncx'):
return self.__handle_tocncx(full_path)
elif re.search('/OPS/[^/]+.xml$', full_path):
return self.__handle_ops_xml(full_path)
return True
def __handle_ops_xml(self, full_path: str) -> bool:
try:
tree, namespace = office._parse_xml(full_path)
except ET.ParseError: # pragma: nocover
logging.error("Unable to parse %s in %s.", full_path, self.filename)
return False
for item in tree.iterfind('.//', namespace): # pragma: nocover
if item.tag.strip().lower().endswith('head'):
item.clear()
break
tree.write(full_path, xml_declaration=True, encoding='utf-8',
short_empty_elements=False)
return True
def __handle_tocncx(self, full_path: str):
def __handle_tocncx(self, full_path: str) -> bool:
try:
tree, namespace = office._parse_xml(full_path)
except ET.ParseError: # pragma: nocover
......@@ -53,7 +89,7 @@ class EPUBParser(archive.ZipParser):
short_empty_elements=False)
return True
def __handle_contentopf(self, full_path: str):
def __handle_contentopf(self, full_path: str) -> bool:
try:
tree, namespace = office._parse_xml(full_path)
except ET.ParseError:
......@@ -71,7 +107,7 @@ class EPUBParser(archive.ZipParser):
item.append(uniqid)
# items without mandatory content
for name in {'language', 'title'}:
for name in ['language', 'title']:
uniqid = ET.Element(self.metadata_namespace + name)
item.append(uniqid)
break # there is only a single <metadata> block
......
......@@ -4,23 +4,20 @@ import logging
import os
import shutil
import subprocess
from typing import Dict, Union, Set
from typing import Union, Set, Dict
from . import abstract
from . import bubblewrap
# Make pyflakes happy
assert Set
class ExiftoolParser(abstract.AbstractParser):
""" Exiftool is often the easiest way to get all the metadata
from a file, which is why several parsers re-use its `get_meta`
method.
"""
meta_allowlist = set() # type: Set[str]
meta_allowlist: Set[str] = set()
def get_meta(self) -> Dict[str, Union[str, dict]]:
def get_meta(self) -> Dict[str, Union[str, Dict]]:
try:
if self.sandbox:
out = bubblewrap.run([_get_exiftool_path(), '-json',
......@@ -70,7 +67,7 @@ class ExiftoolParser(abstract.AbstractParser):
return False
return True
@functools.lru_cache()
@functools.lru_cache(maxsize=None)
def _get_exiftool_path() -> str: # pragma: no cover
which_path = shutil.which('exiftool')
if which_path:
......
import shutil
from typing import Dict, Union
from typing import Union, Dict
from . import abstract
class HarmlessParser(abstract.AbstractParser):
""" This is the parser for filetypes that can not contain metadata. """
mimetypes = {'text/plain', 'image/x-ms-bmp'}
mimetypes = {'text/plain', 'image/x-ms-bmp', 'image/bmp'}
def get_meta(self) -> Dict[str, Union[str, dict]]:
def get_meta(self) -> Dict[str, Union[str, Dict]]:
return dict()
def remove_all(self) -> bool:
......
import imghdr
import os
import re
from typing import Set, Dict, Union, Any
from typing import Union, Any, Dict
import cairo
......@@ -12,10 +11,6 @@ from gi.repository import GdkPixbuf, GLib, Rsvg
from . import exiftool, abstract
# Make pyflakes happy
assert Set
assert Any
class SVGParser(exiftool.ExiftoolParser):
mimetypes = {'image/svg+xml', }
meta_allowlist = {'Directory', 'ExifToolVersion', 'FileAccessDate',
......@@ -26,17 +21,31 @@ class SVGParser(exiftool.ExiftoolParser):
}
def remove_all(self) -> bool:
svg = Rsvg.Handle.new_from_file(self.filename)
dimensions = svg.get_dimensions()
surface = cairo.SVGSurface(self.output_filename,
dimensions.height,
dimensions.width)
try:
svg = Rsvg.Handle.new_from_file(self.filename)
except GLib.GError:
raise ValueError
try:
_, _, _, _, has_viewbox, viewbox = svg.get_intrinsic_dimensions()
if has_viewbox is False:
raise ValueError
_, width, height = svg.get_intrinsic_size_in_pixels()
except AttributeError:
dimensions = svg.get_dimensions()
height, width = dimensions.height, dimensions.width
surface = cairo.SVGSurface(self.output_filename, height, width)
context = cairo.Context(surface)
svg.render_cairo(context)
try:
svg.render_document(context, viewbox)
except AttributeError:
svg.render_cairo(context)
surface.finish()
return True
def get_meta(self) -> Dict[str, Union[str, dict]]:
def get_meta(self) -> Dict[str, Union[str, Dict]]:
meta = super().get_meta()
# The namespace is mandatory, but only the …/2000/svg is valid.
......@@ -45,6 +54,7 @@ class SVGParser(exiftool.ExiftoolParser):
meta.pop('Xmlns')
return meta
class PNGParser(exiftool.ExiftoolParser):
mimetypes = {'image/png', }
meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName',
......@@ -58,12 +68,9 @@ class PNGParser(exiftool.ExiftoolParser):
def __init__(self, filename):
super().__init__(filename)
if imghdr.what(filename) != 'png':
raise ValueError
try: # better fail here than later
cairo.ImageSurface.create_from_png(self.filename)
except Exception: # pragma: no cover
except: # pragma: no cover
# Cairo is returning some weird exceptions :/
raise ValueError
......@@ -98,7 +105,6 @@ class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
def __init__(self, filename):
super().__init__(filename)
# we can't use imghdr here because of https://bugs.python.org/issue28591
try:
GdkPixbuf.Pixbuf.new_from_file(self.filename)
except GLib.GError:
......@@ -110,6 +116,7 @@ class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
_, extension = os.path.splitext(self.filename)
pixbuf = GdkPixbuf.Pixbuf.new_from_file(self.filename)
pixbuf = GdkPixbuf.Pixbuf.apply_embedded_orientation(pixbuf)
if extension.lower() == '.jpg':
extension = '.jpeg' # gdk is picky
elif extension.lower() == '.tif':
......@@ -132,7 +139,7 @@ class JPGParser(GdkPixbufAbstractParser):
'MIMEType', 'ImageWidth', 'ImageSize', 'BitsPerSample',
'ColorComponents', 'EncodingProcess', 'JFIFVersion',
'ResolutionUnit', 'XResolution', 'YCbCrSubSampling',
'YResolution', 'Megapixels', 'ImageHeight'}
'YResolution', 'Megapixels', 'ImageHeight', 'Orientation'}
class TiffParser(GdkPixbufAbstractParser):
......@@ -146,13 +153,14 @@ class TiffParser(GdkPixbufAbstractParser):
'FileInodeChangeDate', 'FileModifyDate', 'FileName',
'FilePermissions', 'FileSize', 'FileType',
'FileTypeExtension', 'ImageHeight', 'ImageSize',
'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile'}
'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile', 'Orientation'}
class PPMParser(abstract.AbstractParser):
mimetypes = {'image/x-portable-pixmap'}
def get_meta(self) -> Dict[str, Union[str, dict]]:
meta = {} # type: Dict[str, Union[str, Dict[Any, Any]]]
def get_meta(self) -> Dict[str, Union[str, Dict]]:
meta: Dict[str, Union[str, Dict[Any, Any]]] = dict()
with open(self.filename) as f:
for idx, line in enumerate(f):
if line.lstrip().startswith('#'):
......@@ -167,3 +175,36 @@ class PPMParser(abstract.AbstractParser):
line = re.sub(r"\s+", "", line, flags=re.UNICODE)
fout.write(line)
return True
class HEICParser(exiftool.ExiftoolParser):
mimetypes = {'image/heic'}
meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
'FileSize', 'FileModifyDate', 'FileAccessDate',
'FileInodeChangeDate', 'FilePermissions', 'FileType',
'FileTypeExtension', 'MIMEType', 'MajorBrand', 'MinorVersion',
'CompatibleBrands','HandlerType', 'PrimaryItemReference',
'HEVCConfigurationVersion', 'GeneralProfileSpace',
'GeneralTierFlag', 'GeneralProfileIDC',
'GenProfileCompatibilityFlags', 'ConstraintIndicatorFlags',
'GeneralLevelIDC', 'MinSpatialSegmentationIDC',
'ParallelismType','ChromaFormat', 'BitDepthLuma', 'BitDepthChroma',
'NumTemporalLayers', 'TemporalIDNested', 'ImageWidth',
'ImageHeight', 'ImageSpatialExtent', 'ImagePixelDepth',
'AverageFrameRate', 'ConstantFrameRate', 'MediaDataSize',
'MediaDataOffset','ImageSize', 'Megapixels'}
def remove_all(self) -> bool:
return self._lightweight_cleanup()
class WEBPParser(GdkPixbufAbstractParser):
mimetypes = {'image/webp'}
meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName',
'Directory', 'FileSize', 'FileModifyDate',
'FileAccessDate', "FileInodeChangeDate",
'FilePermissions', 'FileType', 'FileTypeExtension',
'MIMEType', 'ImageWidth', 'ImageSize', 'BitsPerSample',
'ColorComponents', 'EncodingProcess', 'JFIFVersion',
'ResolutionUnit', 'XResolution', 'YCbCrSubSampling',
'YResolution', 'Megapixels', 'ImageHeight', 'Orientation',
'HorizontalScale', 'VerticalScale', 'VP8Version'}
......@@ -4,7 +4,7 @@ import logging
import os
import re
import zipfile
from typing import Dict, Set, Pattern, Tuple, Any
from typing import Pattern, Any, Tuple, Dict
import xml.etree.ElementTree as ET # type: ignore
......@@ -12,9 +12,6 @@ from .archive import ZipParser
# pylint: disable=line-too-long
# Make pyflakes happy
assert Set
assert Pattern
def _parse_xml(full_path: str) -> Tuple[ET.ElementTree, Dict[str, str]]:
""" This function parses XML, with namespace support. """
......@@ -41,7 +38,7 @@ def _sort_xml_attributes(full_path: str) -> bool:
for c in tree.getroot():
c[:] = sorted(c, key=lambda child: (child.tag, child.get('desc')))
tree.write(full_path, xml_declaration=True)
tree.write(full_path, xml_declaration=True, encoding='utf-8')
return True
......@@ -66,13 +63,24 @@ class MSOfficeParser(ZipParser):
'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml', # /word/footer.xml
'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml', # /word/header.xml
'application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml', # /word/styles.xml
'application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml', # /word/numbering.xml (used for bullet point formatting)
'application/vnd.openxmlformats-officedocument.theme+xml', # /word/theme/theme[0-9].xml (used for font and background coloring, etc.)
'application/vnd.openxmlformats-package.core-properties+xml', # /docProps/core.xml
# for more complicated powerpoints
'application/vnd.openxmlformats-officedocument.presentationml.notesSlide+xml',
'application/vnd.openxmlformats-officedocument.presentationml.notesMaster+xml',
'application/vnd.openxmlformats-officedocument.presentationml.handoutMaster+xml',
'application/vnd.openxmlformats-officedocument.drawingml.diagramData+xml',
'application/vnd.openxmlformats-officedocument.drawingml.diagramLayout+xml',
'application/vnd.openxmlformats-officedocument.drawingml.diagramStyle+xml',
'application/vnd.openxmlformats-officedocument.drawingml.diagramColors+xml',
'application/vnd.ms-office.drawingml.diagramDrawing+xml',
# Do we want to keep the following ones?
'application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml',
}
def __init__(self, filename):
super().__init__(filename)
......@@ -87,9 +95,15 @@ class MSOfficeParser(ZipParser):
self.files_to_keep = set(map(re.compile, { # type: ignore
r'^\[Content_Types\]\.xml$',
r'^_rels/\.rels$',
r'^(?:word|ppt|xl)/_rels/document\.xml\.rels$',
r'^xl/sharedStrings\.xml$', # https://docs.microsoft.com/en-us/office/open-xml/working-with-the-shared-string-table
r'^xl/calcChain\.xml$',
r'^(?:word|ppt|xl)/_rels/(document|workbook|presentation)\.xml\.rels$',
r'^(?:word|ppt|xl)/_rels/footer[0-9]*\.xml\.rels$',
r'^(?:word|ppt|xl)/_rels/header[0-9]*\.xml\.rels$',
r'^(?:word|ppt|xl)/charts/_rels/chart[0-9]+\.xml\.rels$',
r'^(?:word|ppt|xl)/charts/colors[0-9]+\.xml$',
r'^(?:word|ppt|xl)/charts/style[0-9]+\.xml$',
r'^(?:word|ppt|xl)/drawings/_rels/drawing[0-9]+\.xml\.rels$',
r'^(?:word|ppt|xl)/styles\.xml$',
# TODO: randomize axId ( https://docs.microsoft.com/en-us/openspecs/office_standards/ms-oi29500/089f849f-fcd6-4fa0-a281-35aa6a432a16 )
r'^(?:word|ppt|xl)/charts/chart[0-9]*\.xml$',
......@@ -98,6 +112,7 @@ class MSOfficeParser(ZipParser):
r'^ppt/slideLayouts/_rels/slideLayout[0-9]+\.xml\.rels$',
r'^ppt/slideLayouts/slideLayout[0-9]+\.xml$',
r'^(?:word|ppt|xl)/tableStyles\.xml$',
r'^(?:word|ppt|xl)/tables/table[0-9]+\.xml$',
r'^ppt/slides/_rels/slide[0-9]*\.xml\.rels$',
r'^ppt/slides/slide[0-9]*\.xml$',
# https://msdn.microsoft.com/en-us/library/dd908153(v=office.12).aspx
......@@ -106,25 +121,41 @@ class MSOfficeParser(ZipParser):
# TODO: check if p:bgRef can be randomized
r'^ppt/slideMasters/slideMaster[0-9]+\.xml',
r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels',
r'^xl/worksheets/_rels/sheet[0-9]+\.xml\.rels',
r'^(?:word|ppt|xl)/drawings/vmlDrawing[0-9]+\.vml',
r'^(?:word|ppt|xl)/drawings/drawing[0-9]+\.xml',
r'^(?:word|ppt|xl)/embeddings/Microsoft_Excel_Worksheet[0-9]+\.xlsx',
# rels for complicated powerpoints
r'^ppt/notesSlides/_rels/notesSlide[0-9]+\.xml\.rels',
r'^ppt/notesMasters/_rels/notesMaster[0-9]+\.xml\.rels',
r'^ppt/handoutMasters/_rels/handoutMaster[0-9]+\.xml\.rels',
}))
self.files_to_omit = set(map(re.compile, { # type: ignore
r'^\[trash\]/',
r'^customXml/',
r'webSettings\.xml$',
r'^docProps/custom\.xml$',
r'^(?:word|ppt|xl)/printerSettings/',
r'^(?:word|ppt|xl)/theme',
r'^(?:word|ppt|xl)/people\.xml$',
r'^(?:word|ppt|xl)/persons/person\.xml$',
r'^(?:word|ppt|xl)/numbering\.xml$',
r'^(?:word|ppt|xl)/tags/',
r'^(?:word|ppt|xl)/glossary/',
# View properties like view mode, last viewed slide etc
r'^(?:word|ppt|xl)/viewProps\.xml$',
# Additional presentation-wide properties like printing properties,
# presentation show properties etc.
r'^(?:word|ppt|xl)/presProps\.xml$',
r'^(?:word|ppt|xl)/comments[0-9]*\.xml$',
r'^(?:word|ppt|xl)/threadedComments/threadedComment[0-9]*\.xml$',
r'^(?:word|ppt|xl)/commentsExtended\.xml$',
r'^(?:word|ppt|xl)/commentsExtensible\.xml$',
r'^(?:word|ppt|xl)/commentsIds\.xml$',
# we have an allowlist in self.files_to_keep,
# so we can trash everything else
r'^(?:word|ppt|xl)/_rels/',
r'docMetadata/LabelInfo\.xml$'
}))
if self.__fill_files_to_keep_via_content_types() is False:
......@@ -141,13 +172,13 @@ class MSOfficeParser(ZipParser):
return False
xml_data = zin.read('[Content_Types].xml')
self.content_types = dict() # type: Dict[str, str]
self.content_types: Dict[str, str] = dict()
try:
tree = ET.fromstring(xml_data)
except ET.ParseError:
return False
for c in tree:
if 'PartName' not in c.attrib or 'ContentType' not in c.attrib:
if 'PartName' not in c.attrib or 'ContentType' not in c.attrib: # pragma: no cover
continue
elif c.attrib['ContentType'] in self.content_types_to_keep:
fname = c.attrib['PartName'][1:] # remove leading `/`
......@@ -172,7 +203,7 @@ class MSOfficeParser(ZipParser):
return False
# rsid tags and attributes are always under the `w` namespace
if 'w' not in namespace.keys():
if 'w' not in namespace:
return True
parent_map = {c:p for p in tree.iter() for c in p}
......@@ -189,7 +220,7 @@ class MSOfficeParser(ZipParser):
for element in elements_to_remove:
parent_map[element].remove(element)
tree.write(full_path, xml_declaration=True)
tree.write(full_path, xml_declaration=True, encoding='utf-8')
return True
@staticmethod
......@@ -208,10 +239,10 @@ class MSOfficeParser(ZipParser):
return False
# The nsid tag is always under the `w` namespace
if 'w' not in namespace.keys():
if 'w' not in namespace:
return True
parent_map = {c:p for p in tree.iter() for c in p}
parent_map = {c: p for p in tree.iter() for c in p}
elements_to_remove = list()
for element in tree.iterfind('.//w:nsid', namespace):
......@@ -219,10 +250,9 @@ class MSOfficeParser(ZipParser):
for element in elements_to_remove:
parent_map[element].remove(element)
tree.write(full_path, xml_declaration=True)
tree.write(full_path, xml_declaration=True, encoding='utf-8')
return True
@staticmethod
def __remove_revisions(full_path: str) -> bool:
try:
......@@ -253,11 +283,82 @@ class MSOfficeParser(ZipParser):
for children in element.iterfind('./*'):
elements_ins.append((element, position, children))
break
for (element, position, children) in elements_ins:
parent_map[element].insert(position, children)
# the list can sometimes contain duplicate elements, so don't remove
# until all children have been processed
for (element, position, children) in elements_ins:
if element in parent_map[element]:
parent_map[element].remove(element)
tree.write(full_path, xml_declaration=True, encoding='utf-8')
return True
@staticmethod
def __remove_document_comment_meta(full_path: str) -> bool:
try:
tree, namespace = _parse_xml(full_path)
except ET.ParseError as e: # pragma: no cover
logging.error("Unable to parse %s: %s", full_path, e)
return False
# search the docs to see if we can bail early
range_start = tree.find('.//w:commentRangeStart', namespace)
range_end = tree.find('.//w:commentRangeEnd', namespace)
references = tree.find('.//w:commentReference', namespace)
if range_start is None and range_end is None and references is None:
return True # No comment meta tags are present
parent_map = {c:p for p in tree.iter() for c in p}
# iterate over the elements and add them to list
elements_del = list()
for element in tree.iterfind('.//w:commentRangeStart', namespace):
elements_del.append(element)
for element in tree.iterfind('.//w:commentRangeEnd', namespace):
elements_del.append(element)
for element in tree.iterfind('.//w:commentReference', namespace):
elements_del.append(element)
# remove the elements
for element in elements_del:
parent_map[element].remove(element)
tree.write(full_path, xml_declaration=True)
tree.write(full_path, xml_declaration=True, encoding='utf-8')
return True
def __remove_document_xml_rels_members(self, full_path: str) -> bool:
""" Remove the dangling references from the word/_rels/document.xml.rels file, since MS office doesn't like them.
"""
try:
tree, namespace = _parse_xml(full_path)
except ET.ParseError as e: # pragma: no cover
logging.error("Unable to parse %s: %s", full_path, e)
return False
if len(namespace.items()) != 1: # pragma: no cover
logging.debug("Got several namespaces for Types: %s", namespace.items())
removed_fnames = set()
with zipfile.ZipFile(self.filename) as zin:
for fname in [item.filename for item in zin.infolist()]:
for file_to_omit in self.files_to_omit:
if file_to_omit.search(fname):
matches = map(lambda r: r.search(fname), self.files_to_keep)
if any(matches): # the file is in the allowlist
continue
removed_fnames.add(fname)
break
root = tree.getroot()
for item in root.findall('{%s}Relationship' % namespace['']):
name = 'word/' + item.attrib['Target'] # add the word/ prefix to the path, since all document rels are in the word/ directory
if name in removed_fnames:
root.remove(item)
tree.write(full_path, xml_declaration=True, encoding='utf-8')
return True
def __remove_content_type_members(self, full_path: str) -> bool:
......@@ -270,7 +371,7 @@ class MSOfficeParser(ZipParser):
logging.error("Unable to parse %s: %s", full_path, e)
return False
if len(namespace.items()) != 1:
if len(namespace.items()) != 1: # pragma: no cover
logging.debug("Got several namespaces for Types: %s", namespace.items())
removed_fnames = set()
......@@ -290,7 +391,7 @@ class MSOfficeParser(ZipParser):
if name in removed_fnames:
root.remove(item)
tree.write(full_path, xml_declaration=True)
tree.write(full_path, xml_declaration=True, encoding='utf-8')
return True
def _final_checks(self) -> bool:
......@@ -312,7 +413,6 @@ class MSOfficeParser(ZipParser):
for i in re.findall(r'<p:cNvPr id="([0-9]+)"', content):
self.__counters['cNvPr'].add(int(i))
@staticmethod
def __randomize_creationId(full_path: str) -> bool:
try:
......@@ -321,12 +421,12 @@ class MSOfficeParser(ZipParser):
logging.error("Unable to parse %s: %s", full_path, e)
return False
if 'p14' not in namespace.keys():
if 'p14' not in namespace:
return True # pragma: no cover
for item in tree.iterfind('.//p14:creationId', namespace):
item.set('val', '%s' % random.randint(0, 2**32))
tree.write(full_path, xml_declaration=True)
tree.write(full_path, xml_declaration=True, encoding='utf-8')
return True
@staticmethod
......@@ -337,12 +437,12 @@ class MSOfficeParser(ZipParser):
logging.error("Unable to parse %s: %s", full_path, e)
return False
if 'p' not in namespace.keys():
if 'p' not in namespace:
return True # pragma: no cover
for item in tree.iterfind('.//p:sldMasterId', namespace):
item.set('id', '%s' % random.randint(0, 2**32))
tree.write(full_path, xml_declaration=True)
tree.write(full_path, xml_declaration=True, encoding='utf-8')
return True
def _specific_cleanup(self, full_path: str) -> bool:
......@@ -350,7 +450,7 @@ class MSOfficeParser(ZipParser):
if os.stat(full_path).st_size == 0: # Don't process empty files
return True
if not full_path.endswith('.xml'):
if not full_path.endswith(('.xml', '.xml.rels')):
return True
if self.__randomize_creationId(full_path) is False:
......@@ -361,12 +461,19 @@ class MSOfficeParser(ZipParser):
if full_path.endswith('/[Content_Types].xml'):
# this file contains references to files that we might
# remove, and MS Office doesn't like dangling references
if self.__remove_content_type_members(full_path) is False:
if self.__remove_content_type_members(full_path) is False: # pragma: no cover
return False
elif full_path.endswith('/word/document.xml'):
# this file contains the revisions
if self.__remove_revisions(full_path) is False:
return False # pragma: no cover
# remove comment references and ranges
if self.__remove_document_comment_meta(full_path) is False:
return False # pragma: no cover
elif full_path.endswith('/word/_rels/document.xml.rels'):
# similar to the above, but for the document.xml.rels file
if self.__remove_document_xml_rels_members(full_path) is False: # pragma: no cover
return False
elif full_path.endswith('/docProps/app.xml'):
# This file must be present and valid,
# so we're removing as much as we can.
......@@ -418,7 +525,7 @@ class MSOfficeParser(ZipParser):
# see: https://docs.microsoft.com/en-us/dotnet/framework/wpf/advanced/mc-ignorable-attribute
with open(full_path, 'rb') as f:
text = f.read()
out = re.sub(b'mc:Ignorable="[^"]*"', b'', text, 1)
out = re.sub(b'mc:Ignorable="[^"]*"', b'', text, count=1)
with open(full_path, 'wb') as f:
f.write(out)
......@@ -434,8 +541,8 @@ class MSOfficeParser(ZipParser):
with open(full_path, encoding='utf-8') as f:
try:
results = re.findall(r"<(.+)>(.+)</\1>", f.read(), re.I|re.M)
return {k:v for (k, v) in results}
results = re.findall(r"<(.+)>(.+)</\1>", f.read(), re.I | re.M)
return {k: v for (k, v) in results}
except (TypeError, UnicodeDecodeError):
# We didn't manage to parse the xml file
return {file_path: 'harmful content', }
......@@ -452,7 +559,6 @@ class LibreOfficeParser(ZipParser):
'application/vnd.oasis.opendocument.image',
}
def __init__(self, filename):
super().__init__(filename)
......@@ -479,14 +585,14 @@ class LibreOfficeParser(ZipParser):
logging.error("Unable to parse %s: %s", full_path, e)
return False
if 'office' not in namespace.keys(): # no revisions in the current file
if 'office' not in namespace: # no revisions in the current file
return True
for text in tree.getroot().iterfind('.//office:text', namespace):
for changes in text.iterfind('.//text:tracked-changes', namespace):
text.remove(changes)
tree.write(full_path, xml_declaration=True)
tree.write(full_path, xml_declaration=True, encoding='utf-8')
return True
def _specific_cleanup(self, full_path: str) -> bool:
......
import logging
import glob
import os
import mimetypes
import importlib
from typing import TypeVar, List, Tuple, Optional
from typing import TypeVar, Optional, List, Tuple
from . import abstract, UNSUPPORTED_EXTENSIONS
......@@ -12,6 +11,10 @@ T = TypeVar('T', bound='abstract.AbstractParser')
mimetypes.add_type('application/epub+zip', '.epub')
mimetypes.add_type('application/x-dtbncx+xml', '.ncx') # EPUB Navigation Control XML File
# This should be removed after we move to python3.10
# https://github.com/python/cpython/commit/20a5b7e986377bdfd929d7e8c4e3db5847dfdb2d
mimetypes.add_type('image/heic', '.heic')
def __load_all_parsers():
""" Loads every parser in a dynamic way """
......@@ -40,7 +43,10 @@ def _get_parsers() -> List[T]:
def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
""" Return the appropriate parser for a given filename. """
""" Return the appropriate parser for a given filename.
:raises ValueError: Raised if the instantiation of the parser went wrong.
"""
mtype, _ = mimetypes.guess_type(filename)
_, extension = os.path.splitext(filename)
......@@ -53,10 +59,6 @@ def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
for parser_class in _get_parsers(): # type: ignore
if mtype in parser_class.mimetypes:
try:
return parser_class(filename), mtype
except ValueError as e:
logging.info("Got an exception when trying to instantiate "
"%s for %s: %s", parser_class, filename, e)
return None, mtype
# This instantiation might raise a ValueError on malformed files
return parser_class(filename), mtype
return None, mtype
......@@ -7,8 +7,7 @@ import re
import logging
import tempfile
import io
from typing import Dict, Union
from distutils.version import LooseVersion
from typing import Union, Dict
import cairo
import gi
......@@ -17,10 +16,7 @@ from gi.repository import Poppler, GLib
from . import abstract
poppler_version = Poppler.get_version()
if LooseVersion(poppler_version) < LooseVersion('0.46'): # pragma: no cover
raise ValueError("mat2 needs at least Poppler version 0.46 to work. \
The installed version is %s." % poppler_version) # pragma: no cover
FIXED_PDF_VERSION = cairo.PDFVersion.VERSION_1_5
class PDFParser(abstract.AbstractParser):
......@@ -32,7 +28,7 @@ class PDFParser(abstract.AbstractParser):
def __init__(self, filename):
super().__init__(filename)
self.uri = 'file://' + os.path.abspath(self.filename)
self.__scale = 2 # how much precision do we want for the render
self.__scale = 200 / 72.0 # how much precision do we want for the render
try: # Check now that the file is valid, to avoid surprises later
Poppler.Document.new_from_file(self.uri, None)
except GLib.GError: # Invalid PDF
......@@ -40,7 +36,10 @@ class PDFParser(abstract.AbstractParser):
def remove_all(self) -> bool:
if self.lightweight_cleaning is True:
return self.__remove_all_lightweight()
try:
return self.__remove_all_lightweight()
except (cairo.Error, MemoryError) as e:
raise RuntimeError(e)
return self.__remove_all_thorough()
def __remove_all_lightweight(self) -> bool:
......@@ -52,6 +51,7 @@ class PDFParser(abstract.AbstractParser):
tmp_path = tempfile.mkstemp()[1]
pdf_surface = cairo.PDFSurface(tmp_path, 10, 10) # resized later anyway
pdf_surface.restrict_to_version(FIXED_PDF_VERSION)
pdf_context = cairo.Context(pdf_surface) # context draws on the surface
for pagenum in range(pages_count):
......@@ -80,18 +80,19 @@ class PDFParser(abstract.AbstractParser):
_, tmp_path = tempfile.mkstemp()
pdf_surface = cairo.PDFSurface(tmp_path, 32, 32) # resized later anyway
pdf_surface.restrict_to_version(FIXED_PDF_VERSION)
pdf_context = cairo.Context(pdf_surface)
for pagenum in range(pages_count):
page = document.get_page(pagenum)
if page is None:
if page is None: # pragma: no cover
logging.error("Unable to get PDF pages")
return False
page_width, page_height = page.get_size()
logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
width = int(page_width) * self.__scale
height = int(page_height) * self.__scale
width = int(page_width * self.__scale)
height = int(page_height * self.__scale)
img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, height)
img_context = cairo.Context(img_surface)
......@@ -105,7 +106,11 @@ class PDFParser(abstract.AbstractParser):
buf.seek(0)
img = cairo.ImageSurface.create_from_png(buf)
pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale)
if cairo.version_info < (1, 12, 0):
pdf_surface.set_size(width, height)
else:
pdf_surface.set_size(page_width, page_height)
pdf_surface.set_device_scale(1 / self.__scale, 1 / self.__scale)
pdf_context.set_source_surface(img, 0, 0)
pdf_context.paint()
pdf_context.show_page() # draw pdf_context on pdf_surface
......@@ -131,8 +136,8 @@ class PDFParser(abstract.AbstractParser):
# It should(tm) be alright though, because cairo's output format
# for metadata is fixed.
with open(out_file, 'rb') as f:
out = re.sub(rb'<<[\s\n]*/Producer.*?>>', b' << >>', f.read(), 0,
re.DOTALL | re.IGNORECASE)
out = re.sub(rb'<<[\s\n]*/Producer.*?>>', b' << >>', f.read(),
count=0, flags=re.DOTALL | re.IGNORECASE)
with open(out_file, 'wb') as f:
f.write(out)
......@@ -145,7 +150,7 @@ class PDFParser(abstract.AbstractParser):
metadata[key] = value
return metadata
def get_meta(self) -> Dict[str, Union[str, dict]]:
def get_meta(self) -> Dict[str, Union[str, Dict]]:
""" Return a dict with all the meta of the file
"""
metadata = {}
......