Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
  • fix_heic
  • master
  • 0.1.0
  • 0.1.1
  • 0.1.2
  • 0.1.3
  • 0.10.0
  • 0.10.1
  • 0.11.0
  • 0.12.0
  • 0.12.1
  • 0.12.2
  • 0.12.3
  • 0.12.4
  • 0.13.0
  • 0.13.1
  • 0.13.2
  • 0.13.3
  • 0.13.4
  • 0.13.5
  • 0.2.0
  • 0.3.0
  • 0.3.1
  • 0.4.0
  • 0.5.0
  • 0.6.0
  • 0.7.0
  • 0.8.0
  • 0.9.0
29 results

Target

Select target project
  • tguinot/mat2
  • jvoisin/mat2
  • dachary/mat2
  • mejo-/mat2
  • LogicalDash/mat2
  • dkg/mat2
  • christian/mat2
  • Selflike323/mat2
  • fz/mat2
  • iwwmidatlanticgdc/mat2
  • Gu1nn3zz/mat2
  • smagnin/mat2
  • flashcode/mat2
  • MANCASTILLEJA/mat2
  • jboursier/mat2
  • tails/mat2
  • matiargs/mat2
  • Brolf/mat2
  • madaidan/mat2
  • Delmer84/mat2
  • yuebyzua/mat2
  • yyyyyyyan/mat2
  • rmnvgr/mat2
  • Marxism-Leninism/mat2
  • GNUtoo/mat2
  • allexj/mat2
  • b068931cc450442b63f5b3d276ea4297/mat2
  • chenrui/mat2
  • nosec13346/mat2
  • anelki/mat2
30 results
Select Git revision
  • bak
  • elementary-contract
  • implement_lightweight_mode_msoffice
  • inverted_backup
  • master
  • patch-1
  • 0.1.0
  • 0.1.1
  • 0.1.2
  • 0.1.3
  • 0.10.0
  • 0.2.0
  • 0.3.0
  • 0.3.1
  • 0.4.0
  • 0.5.0
  • 0.6.0
  • 0.7.0
  • 0.8.0
  • 0.9.0
20 results
Show changes

Commits on Source 169

69 additional commits have been omitted to prevent performance issues.
variables: variables:
CONTAINER_REGISTRY: $CI_REGISTRY/georg/mat2-ci-images CONTAINER_REGISTRY: $CI_REGISTRY/georg/mat2-ci-images
GIT_DEPTH: "5"
GIT_STRATEGY: clone
stages: stages:
- linting - linting
...@@ -10,42 +12,22 @@ stages: ...@@ -10,42 +12,22 @@ stages:
- useradd --home-dir ${CI_PROJECT_DIR} mat2 - useradd --home-dir ${CI_PROJECT_DIR} mat2
- chown -R mat2 . - chown -R mat2 .
linting:bandit: linting:ruff:
image: $CONTAINER_REGISTRY:linting
stage: linting
script: # TODO: remove B405 and B314
- bandit ./mat2 --format txt --skip B101
- bandit -r ./nautilus/ --format txt --skip B101
- bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314,B108
linting:codespell:
image: $CONTAINER_REGISTRY:linting
stage: linting
script:
# Run codespell to check for spelling errors; ignore errors about binary
# files, use a config with ignored words and exclude the git directory,
# which might contain false positives
- codespell -q 2 -I utils/ci/codespell/ignored_words.txt -S .git
linting:pylint:
image: $CONTAINER_REGISTRY:linting
stage: linting
script:
- pylint --disable=no-else-return,no-else-raise,no-else-continue,unnecessary-comprehension --extension-pkg-whitelist=cairo,gi ./libmat2 ./mat2
# Once nautilus-python is in Debian, decomment it form the line below
- pylint --disable=no-else-return,no-else-raise,no-else-continue,unnecessary-comprehension --extension-pkg-whitelist=Nautilus,GObject,Gtk,Gio,GLib,gi ./nautilus/mat2.py
linting:pyflakes:
image: $CONTAINER_REGISTRY:linting image: $CONTAINER_REGISTRY:linting
stage: linting stage: linting
script: script:
- pyflakes3 ./libmat2 ./mat2 ./tests/ ./nautilus - apt update
- apt install -qqy --no-install-recommends python3-venv
- python3 -m venv venv
- source venv/bin/activate
- pip3 install ruff
- ruff check .
linting:mypy: linting:mypy:
image: $CONTAINER_REGISTRY:linting image: $CONTAINER_REGISTRY:linting
stage: linting stage: linting
script: script:
- mypy --ignore-missing-imports mat2 libmat2/*.py ./nautilus/mat2.py - mypy --ignore-missing-imports mat2 libmat2/*.py
tests:archlinux: tests:archlinux:
image: $CONTAINER_REGISTRY:archlinux image: $CONTAINER_REGISTRY:archlinux
...@@ -56,17 +38,20 @@ tests:archlinux: ...@@ -56,17 +38,20 @@ tests:archlinux:
tests:debian: tests:debian:
image: $CONTAINER_REGISTRY:debian image: $CONTAINER_REGISTRY:debian
stage: test stage: test
<<: *prepare_env
script: script:
- apt-get -qqy purge bubblewrap - apt-get -qqy purge bubblewrap
- python3 -m unittest discover -v - su - mat2 -c "python3-coverage run --branch -m unittest discover -s tests/"
- su - mat2 -c "python3-coverage report --fail-under=95 -m --include 'libmat2/*'"
tests:debian_with_bubblewrap: tests:debian_with_bubblewrap:
image: $CONTAINER_REGISTRY:debian image: $CONTAINER_REGISTRY:debian
stage: test stage: test
allow_failure: true
<<: *prepare_env <<: *prepare_env
script: script:
- su - mat2 -c "python3-coverage run --branch -m unittest discover -s tests/" - apt-get -qqy install bubblewrap
- su - mat2 -c "python3-coverage report --fail-under=100 -m --include 'libmat2/*'" - python3 -m unittest discover -v
tests:fedora: tests:fedora:
image: $CONTAINER_REGISTRY:fedora image: $CONTAINER_REGISTRY:fedora
...@@ -80,3 +65,51 @@ tests:gentoo: ...@@ -80,3 +65,51 @@ tests:gentoo:
<<: *prepare_env <<: *prepare_env
script: script:
- su - mat2 -c "python3 -m unittest discover -v" - su - mat2 -c "python3 -m unittest discover -v"
# CI matrix: run the full unittest suite under every supported CPython
# version, one job per version-specific container image. The jobs are
# identical except for the image tag; they are kept expanded (rather than
# factored through a YAML anchor or `extends`) so that individual versions
# can be added or dropped without touching the others.
# NOTE(review): indentation appears flattened in this rendered diff view;
# in the real .gitlab-ci.yml, `image`/`stage`/`script` must be indented
# under each job key — confirm against the repository copy.
tests:python3.7:
image: $CONTAINER_REGISTRY:python3.7
stage: test
script:
- python3 -m unittest discover -v
tests:python3.8:
image: $CONTAINER_REGISTRY:python3.8
stage: test
script:
- python3 -m unittest discover -v
tests:python3.9:
image: $CONTAINER_REGISTRY:python3.9
stage: test
script:
- python3 -m unittest discover -v
tests:python3.10:
image: $CONTAINER_REGISTRY:python3.10
stage: test
script:
- python3 -m unittest discover -v
tests:python3.11:
image: $CONTAINER_REGISTRY:python3.11
stage: test
script:
- python3 -m unittest discover -v
tests:python3.12:
image: $CONTAINER_REGISTRY:python3.12
stage: test
script:
- python3 -m unittest discover -v
tests:python3.13:
image: $CONTAINER_REGISTRY:python3.13
stage: test
script:
- python3 -m unittest discover -v
tests:python3.14:
image: $CONTAINER_REGISTRY:python3.14
stage: test
script:
- python3 -m unittest discover -v
...@@ -14,4 +14,5 @@ disable= ...@@ -14,4 +14,5 @@ disable=
catching-non-exception, catching-non-exception,
cell-var-from-loop, cell-var-from-loop,
locally-disabled, locally-disabled,
raise-missing-from,
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
# 0.13.5 - 2025-01-09
- Keep orientation metadata on jpeg and tiff files
- Improve cairo-related error/exceptions handling
- Improve the logging
- Improve the sandboxing
- Improve Python3.12 support
- Improve MSOffice documents handling
# 0.13.4 - 2023-08-02
- Add documentation about mat2 on OSX
- Make use of python3.7 constructs to simplify code
- Use more modern type annotations
- Harden get_meta in archive.py against variants of CVE-2022-35410
- Improve MSOffice document support
- Package the manpage on pypi
# 0.13.3 - 2023-02-23
- Fix a decorator argument
# 0.13.2 - 2023-01-28
- Fix a crash on some python versions
# 0.13.1 - 2023-01-07
- Improve xlsx support
- Remove the Nautilus extension
# 0.13.0 - 2022-07-06
- Fix an arbitrary file read (CVE-2022-35410)
- Add support for heic files
# 0.12.4 - 2022-04-30
- Fix possible errors/crashes when processing multiple files
via the command line interface
- Use a fixed PDF version for the output
- Improve compatibility with modern versions of rsvg
- Improve the robustness of the command line interface with
regard to control characters
# 0.12.3 - 2022-01-06
- Implement code for internationalization
- Keep individual files compression type in zip files
- Increase the robustness of mat2 against weird/corrupted files
- Fix the dolphin integration
- Add a fuzzer
# 0.12.2 - 2021-08-29
- Add support for aiff files
- Improve MS Office support
- Improve compatibility with newer/older version of mat2's dependencies
- Fix possible issues with the resolution of processed pdf
# 0.12.1 - 2021-03-19
- Improve epub support
- Improve MS Office support
# 0.12.0 - 2020-12-18
- Improve significantly MS Office formats support
- Fix some typos in the Nautilus extension
- Improve reliability of the mp3, pdf and svg parsers
- Improve compatibility with ffmpeg when sandboxing is used
- Improve the dolphin extension usability
- libmat2 now raises a ValueError on malformed files while trying to
find the right parser, instead of returning None
# 0.11.0 - 2020-03-29
- Improve significantly MS Office formats support
- Refactor how mat2 looks for executables
# 0.10.1 - 2020-02-09
- Improve the documentation and the manpage
- Improve the robustness of css, html, png, gdk-based, exiftool-based parsers
- Future-proof a bit the testsuite
- Handle tiff files with a .tif extension
- Improve the sandbox' usability
- Add support for wav files
# 0.10.0 - 2019-11-30 # 0.10.0 - 2019-11-30
- Make mat2 work on Python3.8 - Make mat2 work on Python3.8
......
...@@ -4,8 +4,14 @@ The main repository for mat2 is on [0xacab]( https://0xacab.org/jvoisin/mat2 ), ...@@ -4,8 +4,14 @@ The main repository for mat2 is on [0xacab]( https://0xacab.org/jvoisin/mat2 ),
but you can send patches to jvoisin by [email](https://dustri.org/) if you prefer. but you can send patches to jvoisin by [email](https://dustri.org/) if you prefer.
Do feel free to pick up [an issue]( https://0xacab.org/jvoisin/mat2/issues ) Do feel free to pick up [an issue]( https://0xacab.org/jvoisin/mat2/issues )
and to send a pull-request. Please do check that everything is fine by running the and to send a pull-request.
testsuite with `python3 -m unittest discover -v` before submitting one :)
Before sending the pull-request, please do check that everything is fine by
running the full test suite in GitLab. To do that, after forking mat2 in GitLab,
you need to go in Settings -> CI/CD -> Runner and there enable shared runners.
Mat2 also has unit tests (that are also run in the full test suite). You can run
them with `python3 -m unittest discover -v`.
If you're fixing a bug or adding a new feature, please add tests accordingly, If you're fixing a bug or adding a new feature, please add tests accordingly,
this will greatly improve the odds of your merge-request getting merged. this will greatly improve the odds of your merge-request getting merged.
...@@ -24,15 +30,16 @@ Since mat2 is written in Python3, please conform as much as possible to the ...@@ -24,15 +30,16 @@ Since mat2 is written in Python3, please conform as much as possible to the
1. Update the [changelog](https://0xacab.org/jvoisin/mat2/blob/master/CHANGELOG.md) 1. Update the [changelog](https://0xacab.org/jvoisin/mat2/blob/master/CHANGELOG.md)
2. Update the version in the [mat2](https://0xacab.org/jvoisin/mat2/blob/master/mat2) file 2. Update the version in the [mat2](https://0xacab.org/jvoisin/mat2/blob/master/mat2) file
3. Update the version in the [setup.py](https://0xacab.org/jvoisin/mat2/blob/master/setup.py) file 3. Update the version in the [setup.py](https://0xacab.org/jvoisin/mat2/blob/master/setup.py) file
4. Update the version and date in the [man page](https://0xacab.org/jvoisin/mat2/blob/master/doc/mat2.1) 4. Update the version in the [pyproject.toml](https://0xacab.org/jvoisin/mat2/blob/master/pyproject.toml) file
5. Commit the changelog, man page, mat2 and setup.py files 5. Update the version and date in the [man page](https://0xacab.org/jvoisin/mat2/blob/master/doc/mat2.1)
6. Create a tag with `git tag -s $VERSION` 6. Commit the modified files
7. Push the commit with `git push origin master` 7. Create a tag with `git tag -s $VERSION`
8. Push the tag with `git push --tags` 8. Push the commit with `git push origin master`
9. Download the gitlab archive of the release 9. Push the tag with `git push --tags`
10. Diff it against the local copy 10. Download the gitlab archive of the release
11. If there is no difference, sign the archive with `gpg --armor --detach-sign mat2-$VERSION.tar.xz` 11. Diff it against the local copy
12. Upload the signature on Gitlab's [tag page](https://0xacab.org/jvoisin/mat2/tags) and add the changelog there 12. If there is no difference, sign the archive with `gpg --armor --detach-sign mat2-$VERSION.tar.xz`
13. Announce the release on the [mailing list](https://mailman.boum.org/listinfo/mat-dev) 13. Upload the signature on Gitlab's [tag page](https://0xacab.org/jvoisin/mat2/tags) and add the changelog there
14. Sign'n'upload the new version on pypi with `python3 setup.py sdist bdist_wheel` then `twine upload -s dist/*` 14. Announce the release on the [mailing list](https://mailman.boum.org/listinfo/mat-dev)
15. Do the secret release dance 15. Sign'n'upload the new version on pypi with `python3 setup.py sdist bdist_wheel` then `twine upload -s dist/*`
16. Do the secret release dance
...@@ -18,34 +18,53 @@ installed, mat2 uses it to sandbox any external processes it invokes. ...@@ -18,34 +18,53 @@ installed, mat2 uses it to sandbox any external processes it invokes.
## Arch Linux ## Arch Linux
Thanks to [Francois_B](https://www.sciunto.org/), there is a package available on Thanks to [kpcyrd](https://archlinux.org/packages/?maintainer=kpcyrd), there is a package available on
[Arch linux's AUR](https://aur.archlinux.org/packages/mat2/). [Arch linux's AUR](https://archlinux.org/packages/extra/any/mat2/).
## Debian ## Debian
There is a package available in [Debian](https://packages.debian.org/search?keywords=mat2&searchon=names&section=all). There is a package available in [Debian](https://packages.debian.org/search?keywords=mat2&searchon=names&section=all) and you can install mat2 with:
```
apt install mat2
```
## Fedora ## Fedora
Thanks to [atenart](https://ack.tf/), there is a package available on Thanks to [atenart](https://ack.tf/), there is a package available on
[Fedora's copr]( https://copr.fedorainfracloud.org/coprs/atenart/mat2/ ). [Fedora's copr]( https://copr.fedorainfracloud.org/coprs/atenart/mat2/ ).
We use copr (cool other packages repo) as the Mat2 Nautilus plugin depends on First you need to enable mat2's copr:
python3-nautilus, which isn't available yet in Fedora (but is distributed
through this copr).
First you need to enable Mat2's copr:
``` ```
dnf -y copr enable atenart/mat2 dnf -y copr enable atenart/mat2
``` ```
Then you can install both the Mat2 command and Nautilus extension: Then you can install mat2:
``` ```
dnf -y install mat2 mat2-nautilus dnf -y install mat2
``` ```
## Gentoo ## Gentoo
mat2 is available in the [torbrowser overlay](https://github.com/MeisterP/torbrowser-overlay). mat2 is available in the [torbrowser overlay](https://github.com/MeisterP/torbrowser-overlay).
# OSX
## Homebrew
mat2 is [available on homebrew](https://formulae.brew.sh/formula/mat2):
```
brew install mat2
```
## MacPorts
mat2 is [available on MacPorts](https://ports.macports.org/port/mat2/):
```
port install mat2
```
``` ```
_____ _____ _____ ___ _____ _____ _____ ___
| | _ |_ _|_ | Keep your data, | | _ |_ _|_ | Keep your data,
| | | | | | | | _| trash your meta! | | | | |_| | | | | _| trash your meta!
|_|_|_|__|__| |_| |___| |_|_|_|_| |_| |_| |___|
``` ```
This software is currently in **beta**, please don't use it for anything
critical.
# Metadata and privacy # Metadata and privacy
Metadata consist of information that characterizes data. Metadata consist of information that characterizes data.
...@@ -25,9 +22,14 @@ Maybe you don't want to disclose those information. ...@@ -25,9 +22,14 @@ Maybe you don't want to disclose those information.
This is precisely the job of mat2: getting rid, as much as possible, of This is precisely the job of mat2: getting rid, as much as possible, of
metadata. metadata.
mat2 provides a command line tool, and graphical user interfaces via a service mat2 provides:
menu for Dolphin, the default file manager of KDE, and an extension for - a library called `libmat2`;
Nautilus, the default file manager of GNOME. - a command line tool called `mat2`,
- a service menu for Dolphin, KDE's default file manager
If you prefer a regular graphical user interface, you might be interested in
[Metadata Cleaner](https://metadatacleaner.romainvigier.fr/), which is using
`mat2` under the hood.
# Requirements # Requirements
...@@ -41,6 +43,12 @@ Nautilus, the default file manager of GNOME. ...@@ -41,6 +43,12 @@ Nautilus, the default file manager of GNOME.
Please note that mat2 requires at least Python3.5. Please note that mat2 requires at least Python3.5.
# Requirements setup on macOS (OS X) using [Homebrew](https://brew.sh/)
```bash
brew install exiftool cairo pygobject3 poppler gdk-pixbuf librsvg ffmpeg
```
# Running the test suite # Running the test suite
```bash ```bash
...@@ -74,7 +82,7 @@ optional arguments: ...@@ -74,7 +82,7 @@ optional arguments:
(policy should be one of: abort, omit, keep) [Default: (policy should be one of: abort, omit, keep) [Default:
abort] abort]
--inplace clean in place, without backup --inplace clean in place, without backup
--no-sandbox Disable bubblewrap's sandboxing. --no-sandbox Disable bubblewrap's sandboxing
-v, --version show program's version number and exit -v, --version show program's version number and exit
-l, --list list all supported fileformats -l, --list list all supported fileformats
--check-dependencies check if mat2 has all the dependencies it needs --check-dependencies check if mat2 has all the dependencies it needs
...@@ -87,6 +95,26 @@ Note that mat2 **will not** clean files in-place, but will produce, for ...@@ -87,6 +95,26 @@ Note that mat2 **will not** clean files in-place, but will produce, for
example, with a file named "myfile.png" a cleaned version named example, with a file named "myfile.png" a cleaned version named
"myfile.cleaned.png". "myfile.cleaned.png".
## Web interface
It's possible to run mat2 as a web service, via
[mat2-web](https://0xacab.org/jvoisin/mat2-web).
If you're using WordPress, you might be interested in [wp-mat](https://git.autistici.org/noblogs/wp-mat)
and [wp-mat-server](https://git.autistici.org/noblogs/wp-mat-server).
## Desktop GUI
For GNU/Linux desktops, it's possible to use the
[Metadata Cleaner](https://gitlab.com/rmnvgr/metadata-cleaner) GTK application.
# Supported formats
The following formats are supported: avi, bmp, css, epub/ncx, flac, gif, jpeg,
m4a/mp2/mp3/…, mp4, odc/odf/odg/odi/odp/ods/odt/…, off/opus/oga/spx/…, pdf,
png, ppm, pptx/xlsx/docx/…, svg/svgz/…, tar/tar.gz/tar.bz2/tar.xz/…, tiff,
torrent, wav, wmv, zip, …
# Notes about detecting metadata # Notes about detecting metadata
While mat2 is doing its very best to display metadata when the `--show` flag is While mat2 is doing its very best to display metadata when the `--show` flag is
...@@ -116,15 +144,21 @@ of the guarantee that mat2 won't modify the data of their files, there is the ...@@ -116,15 +144,21 @@ of the guarantee that mat2 won't modify the data of their files, there is the
watermarks from PDF. watermarks from PDF.
- [Scrambled Exif](https://f-droid.org/packages/com.jarsilio.android.scrambledeggsif/), - [Scrambled Exif](https://f-droid.org/packages/com.jarsilio.android.scrambledeggsif/),
an open-source Android application to remove metadata from pictures. an open-source Android application to remove metadata from pictures.
- [Dangerzone](https://dangerzone.rocks/), designed to sanitize harmful documents
into harmless ones.
# Contact # Contact
If possible, use the [issues system](https://0xacab.org/jvoisin/mat2/issues) If possible, use the [issues system](https://0xacab.org/jvoisin/mat2/issues)
or the [mailing list](https://mailman.boum.org/listinfo/mat-dev) or the [mailing list](https://www.autistici.org/mailman/listinfo/mat-dev)
Should a more private contact be needed (eg. for reporting security issues), Should a more private contact be needed (eg. for reporting security issues),
you can email Julien (jvoisin) Voisin at `julien.voisin+mat2@dustri.org`, you can email Julien (jvoisin) Voisin at `julien.voisin+mat2@dustri.org`,
using the gpg key `9FCDEE9E1A381F311EA62A7404D041E8171901CC`. using the gpg key `9FCDEE9E1A381F311EA62A7404D041E8171901CC`.
# Donations
If you want to donate some money, please give it to [Tails]( https://tails.boum.org/donate/?r=contribute ).
# License # License
This program is free software: you can redistribute it and/or modify This program is free software: you can redistribute it and/or modify
...@@ -146,6 +180,8 @@ Copyright 2016 Marie-Rose for mat2's logo ...@@ -146,6 +180,8 @@ Copyright 2016 Marie-Rose for mat2's logo
The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3, The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3,
and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx
The `narrated_powerpoint_presentation.pptx` file is in the public domain.
# Thanks # Thanks
mat2 wouldn't exist without: mat2 wouldn't exist without:
...@@ -155,4 +191,3 @@ mat2 wouldn't exist without: ...@@ -155,4 +191,3 @@ mat2 wouldn't exist without:
- friends - friends
Many thanks to them! Many thanks to them!
.TH mat2 "1" "November 2019" "mat2 0.10.0" "User Commands" .TH mat2 "1" "January 2025" "mat2 0.13.5" "User Commands"
.SH NAME .SH NAME
mat2 \- the metadata anonymisation toolkit 2 mat2 \- the metadata anonymisation toolkit 2
...@@ -71,6 +71,14 @@ complex file formats. ...@@ -71,6 +71,14 @@ complex file formats.
.PP .PP
This is why you shouldn't rely on metadata's presence to decide if your file must This is why you shouldn't rely on metadata's presence to decide if your file must
be cleaned or not. be cleaned or not.
.PP
Moreover, mat2 goes to great lengths to make sure that as much metadata as
possible is removed. This might sometimes result in a loss of quality of the
processed files. For example, a text-based pdf file converted into an image-based
one means that it will no longer be possible to select text in it. If you're
experiencing this, you might want to give the lightweight cleaning mode a try,
but keep in mind that by doing so, some metadata \fBwon't be cleaned\fR.
.SH BUGS .SH BUGS
......
...@@ -6,6 +6,8 @@ Type=Service ...@@ -6,6 +6,8 @@ Type=Service
[Desktop Action cleanMetadata] [Desktop Action cleanMetadata]
Name=Clean metadata Name=Clean metadata
Name[de]=Metadaten löschen
Name[es]=Limpiar metadatos Name[es]=Limpiar metadatos
Icon=/usr/share/icons/hicolor/scalable/apps/mat2.svg Icon=/usr/share/icons/hicolor/scalable/apps/mat2.svg
Exec=kdialog --yesno "$( mat2 -s %U )" --title "Clean Metadata?" && mat2 %U Exec=kdialog --yesno "$( mat2 -s %F )" --title "Clean Metadata?" && mat2 %U
Exec[de]=kdialog --yesno "$( mat2 -s %F )" --title "Metadaten löschen?" && mat2 %U
...@@ -2,15 +2,10 @@ ...@@ -2,15 +2,10 @@
import enum import enum
import importlib import importlib
from typing import Dict, Optional, Union from typing import Dict
from . import exiftool, video from . import exiftool, video
# make pyflakes happy
assert Dict
assert Optional
assert Union
# A set of extension that aren't supported, despite matching a supported mimetype # A set of extension that aren't supported, despite matching a supported mimetype
UNSUPPORTED_EXTENSIONS = { UNSUPPORTED_EXTENSIONS = {
'.asc', '.asc',
...@@ -67,8 +62,9 @@ CMD_DEPENDENCIES = { ...@@ -67,8 +62,9 @@ CMD_DEPENDENCIES = {
}, },
} }
def check_dependencies() -> Dict[str, Dict[str, bool]]: def check_dependencies() -> Dict[str, Dict[str, bool]]:
ret = dict() # type: Dict[str, dict] ret: Dict[str, Dict] = dict()
for key, value in DEPENDENCIES.items(): for key, value in DEPENDENCIES.items():
ret[key] = { ret[key] = {
......
import abc import abc
import os import os
import re import re
from typing import Set, Dict, Union from typing import Union, Set, Dict
assert Set # make pyflakes happy
class AbstractParser(abc.ABC): class AbstractParser(abc.ABC):
...@@ -11,8 +9,8 @@ class AbstractParser(abc.ABC): ...@@ -11,8 +9,8 @@ class AbstractParser(abc.ABC):
It might yield `ValueError` on instantiation on invalid files, It might yield `ValueError` on instantiation on invalid files,
and `RuntimeError` when something went wrong in `remove_all`. and `RuntimeError` when something went wrong in `remove_all`.
""" """
meta_list = set() # type: Set[str] meta_list: Set[str] = set()
mimetypes = set() # type: Set[str] mimetypes: Set[str] = set()
def __init__(self, filename: str) -> None: def __init__(self, filename: str) -> None:
""" """
...@@ -35,8 +33,11 @@ class AbstractParser(abc.ABC): ...@@ -35,8 +33,11 @@ class AbstractParser(abc.ABC):
self.sandbox = True self.sandbox = True
@abc.abstractmethod @abc.abstractmethod
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, Dict]]:
"""Return all the metadata of the current file""" """Return all the metadata of the current file
:raises RuntimeError: Raised if the cleaning process went wrong.
"""
@abc.abstractmethod @abc.abstractmethod
def remove_all(self) -> bool: def remove_all(self) -> bool:
......
...@@ -7,14 +7,10 @@ import tempfile ...@@ -7,14 +7,10 @@ import tempfile
import os import os
import logging import logging
import shutil import shutil
from typing import Dict, Set, Pattern, Union, Any, List from typing import Pattern, Union, Any, Set, Dict, List
from . import abstract, UnknownMemberPolicy, parser_factory from . import abstract, UnknownMemberPolicy, parser_factory
# Make pyflakes happy
assert Set
assert Pattern
# pylint: disable=not-callable,assignment-from-no-return,too-many-branches # pylint: disable=not-callable,assignment-from-no-return,too-many-branches
# An ArchiveClass is a class representing an archive, # An ArchiveClass is a class representing an archive,
...@@ -53,15 +49,15 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -53,15 +49,15 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# Those are the files that have a format that _isn't_ # Those are the files that have a format that _isn't_
# supported by mat2, but that we want to keep anyway. # supported by mat2, but that we want to keep anyway.
self.files_to_keep = set() # type: Set[Pattern] self.files_to_keep: Set[Pattern] = set()
# Those are the files that we _do not_ want to keep, # Those are the files that we _do not_ want to keep,
# no matter if they are supported or not. # no matter if they are supported or not.
self.files_to_omit = set() # type: Set[Pattern] self.files_to_omit: Set[Pattern] = set()
# what should the parser do if it encounters an unknown file in # what should the parser do if it encounters an unknown file in
# the archive? # the archive?
self.unknown_member_policy = UnknownMemberPolicy.ABORT # type: UnknownMemberPolicy self.unknown_member_policy: UnknownMemberPolicy = UnknownMemberPolicy.ABORT
# The LGTM comment is to mask a false-positive, # The LGTM comment is to mask a false-positive,
# see https://lgtm.com/projects/g/jvoisin/mat2/ # see https://lgtm.com/projects/g/jvoisin/mat2/
...@@ -73,15 +69,22 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -73,15 +69,22 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
def _specific_cleanup(self, full_path: str) -> bool: def _specific_cleanup(self, full_path: str) -> bool:
""" This method can be used to apply specific treatment """ This method can be used to apply specific treatment
to files present in the archive.""" to files present in the archive."""
# pylint: disable=unused-argument,no-self-use # pylint: disable=unused-argument
return True # pragma: no cover return True # pragma: no cover
def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]: def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
""" This method can be used to extract specific metadata """ This method can be used to extract specific metadata
from files present in the archive.""" from files present in the archive."""
# pylint: disable=unused-argument,no-self-use # pylint: disable=unused-argument
return {} # pragma: no cover return {} # pragma: no cover
def _final_checks(self) -> bool:
""" This method is invoked after the file has been cleaned,
allowing to run final verifications.
"""
# pylint: disable=unused-argument
return True
@staticmethod @staticmethod
@abc.abstractmethod @abc.abstractmethod
def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]: def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
...@@ -102,6 +105,11 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -102,6 +105,11 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
def _get_member_name(member: ArchiveMember) -> str: def _get_member_name(member: ArchiveMember) -> str:
"""Return the name of the given member.""" """Return the name of the given member."""
@staticmethod
@abc.abstractmethod
def _is_dir(member: ArchiveMember) -> bool:
"""Return true is the given member is a directory."""
@abc.abstractmethod @abc.abstractmethod
def _add_file_to_archive(self, archive: ArchiveClass, member: ArchiveMember, def _add_file_to_archive(self, archive: ArchiveClass, member: ArchiveMember,
full_path: str): full_path: str):
...@@ -113,8 +121,20 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -113,8 +121,20 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# pylint: disable=unused-argument # pylint: disable=unused-argument
return member return member
def get_meta(self) -> Dict[str, Union[str, dict]]: @staticmethod
meta = dict() # type: Dict[str, Union[str, dict]] def _get_member_compression(member: ArchiveMember):
"""Get the compression of the archive member."""
# pylint: disable=unused-argument
return None
@staticmethod
def _set_member_compression(member: ArchiveMember, compression) -> ArchiveMember:
"""Set the compression of the archive member."""
# pylint: disable=unused-argument
return member
def get_meta(self) -> Dict[str, Union[str, Dict]]:
meta: Dict[str, Union[str, Dict]] = dict()
with self.archive_class(self.filename) as zin: with self.archive_class(self.filename) as zin:
temp_folder = tempfile.mkdtemp() temp_folder = tempfile.mkdtemp()
...@@ -123,12 +143,20 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -123,12 +143,20 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
local_meta = self._get_member_meta(item) local_meta = self._get_member_meta(item)
member_name = self._get_member_name(item) member_name = self._get_member_name(item)
if member_name[-1] == '/': # pragma: no cover if self._is_dir(item): # pragma: no cover
# `is_dir` is added in Python3.6
continue # don't keep empty folders continue # don't keep empty folders
zin.extract(member=item, path=temp_folder)
full_path = os.path.join(temp_folder, member_name) full_path = os.path.join(temp_folder, member_name)
if not os.path.abspath(full_path).startswith(temp_folder):
logging.error("%s contains a file (%s) pointing outside (%s) of its root.",
self.filename, member_name, full_path)
break
try:
zin.extract(member=item, path=temp_folder)
except OSError as e:
logging.error("Unable to extraxt %s from %s: %s", item, self.filename, e)
os.chmod(full_path, stat.S_IRUSR) os.chmod(full_path, stat.S_IRUSR)
specific_meta = self._specific_get_meta(full_path, member_name) specific_meta = self._specific_get_meta(full_path, member_name)
...@@ -136,6 +164,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -136,6 +164,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
member_parser, _ = parser_factory.get_parser(full_path) # type: ignore member_parser, _ = parser_factory.get_parser(full_path) # type: ignore
if member_parser: if member_parser:
member_parser.sandbox = self.sandbox
local_meta = {**local_meta, **member_parser.get_meta()} local_meta = {**local_meta, **member_parser.get_meta()}
if local_meta: if local_meta:
...@@ -155,12 +184,12 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -155,12 +184,12 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# Sort the items to process, to reduce fingerprinting, # Sort the items to process, to reduce fingerprinting,
# and keep them in the `items` variable. # and keep them in the `items` variable.
items = list() # type: List[ArchiveMember] items: List[ArchiveMember] = list()
for item in sorted(self._get_all_members(zin), key=self._get_member_name): for item in sorted(self._get_all_members(zin), key=self._get_member_name):
# Some fileformats do require to have the `mimetype` file # Some fileformats do require to have the `mimetype` file
# as the first file in the archive. # as the first file in the archive.
if self._get_member_name(item) == 'mimetype': if self._get_member_name(item) == 'mimetype':
items = [item] + items items.insert(0, item)
else: else:
items.append(item) items.append(item)
...@@ -168,18 +197,36 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -168,18 +197,36 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# we're iterating (and thus inserting) them in lexicographic order. # we're iterating (and thus inserting) them in lexicographic order.
for item in items: for item in items:
member_name = self._get_member_name(item) member_name = self._get_member_name(item)
if member_name[-1] == '/': # `is_dir` is added in Python3.6 if self._is_dir(item):
continue # don't keep empty folders continue # don't keep empty folders
zin.extract(member=item, path=temp_folder)
full_path = os.path.join(temp_folder, member_name) full_path = os.path.join(temp_folder, member_name)
if not os.path.abspath(full_path).startswith(temp_folder):
logging.error("%s contains a file (%s) pointing outside (%s) of its root.",
self.filename, member_name, full_path)
abort = True
break
zin.extract(member=item, path=temp_folder)
try:
original_permissions = os.stat(full_path).st_mode original_permissions = os.stat(full_path).st_mode
except FileNotFoundError:
logging.error("Something went wrong during processing of "
"%s in %s, likely a path traversal attack.",
member_name, self.filename)
abort = True
# we're breaking instead of continuing, because this exception
# is raised in case of weird path-traversal-like atttacks.
break
os.chmod(full_path, original_permissions | stat.S_IWUSR | stat.S_IRUSR) os.chmod(full_path, original_permissions | stat.S_IWUSR | stat.S_IRUSR)
original_compression = self._get_member_compression(item)
if self._specific_cleanup(full_path) is False: if self._specific_cleanup(full_path) is False:
logging.warning("Something went wrong during deep cleaning of %s", logging.warning("Something went wrong during deep cleaning of %s in %s",
member_name) member_name, self.filename)
abort = True abort = True
continue continue
...@@ -205,6 +252,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -205,6 +252,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
abort = True abort = True
continue continue
else: else:
member_parser.sandbox = self.sandbox
if member_parser.remove_all() is False: if member_parser.remove_all() is False:
logging.warning("In file %s, something went wrong \ logging.warning("In file %s, something went wrong \
with the cleaning of %s \ with the cleaning of %s \
...@@ -216,6 +264,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -216,6 +264,7 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
zinfo = self.member_class(member_name) # type: ignore zinfo = self.member_class(member_name) # type: ignore
zinfo = self._set_member_permissions(zinfo, original_permissions) zinfo = self._set_member_permissions(zinfo, original_permissions)
zinfo = self._set_member_compression(zinfo, original_compression)
clean_zinfo = self._clean_member(zinfo) clean_zinfo = self._clean_member(zinfo)
self._add_file_to_archive(zout, clean_zinfo, full_path) self._add_file_to_archive(zout, clean_zinfo, full_path)
...@@ -223,11 +272,14 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -223,11 +272,14 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
if abort: if abort:
os.remove(self.output_filename) os.remove(self.output_filename)
return False return False
if not self._final_checks():
return False # pragma: no cover
return True return True
class TarParser(ArchiveBasedAbstractParser): class TarParser(ArchiveBasedAbstractParser):
mimetypes = {'application/x-tar'} mimetypes = {'application/x-tar'}
def __init__(self, filename): def __init__(self, filename):
super().__init__(filename) super().__init__(filename)
# yes, it's tarfile.open and not tarfile.TarFile, # yes, it's tarfile.open and not tarfile.TarFile,
...@@ -337,6 +389,11 @@ class TarParser(ArchiveBasedAbstractParser): ...@@ -337,6 +389,11 @@ class TarParser(ArchiveBasedAbstractParser):
member.mode = permissions member.mode = permissions
return member return member
@staticmethod
def _is_dir(member: ArchiveMember) -> bool:
assert isinstance(member, tarfile.TarInfo) # please mypy
return member.isdir()
class TarGzParser(TarParser): class TarGzParser(TarParser):
compression = ':gz' compression = ':gz'
...@@ -355,16 +412,17 @@ class TarXzParser(TarParser): ...@@ -355,16 +412,17 @@ class TarXzParser(TarParser):
class ZipParser(ArchiveBasedAbstractParser): class ZipParser(ArchiveBasedAbstractParser):
mimetypes = {'application/zip'} mimetypes = {'application/zip'}
def __init__(self, filename):
def __init__(self, filename: str):
super().__init__(filename) super().__init__(filename)
self.archive_class = zipfile.ZipFile self.archive_class = zipfile.ZipFile
self.member_class = zipfile.ZipInfo self.member_class = zipfile.ZipInfo
self.zip_compression_type = zipfile.ZIP_DEFLATED
def is_archive_valid(self): def is_archive_valid(self):
try: try:
zipfile.ZipFile(self.filename) with zipfile.ZipFile(self.filename):
except zipfile.BadZipFile: pass
except (zipfile.BadZipFile, OSError):
raise ValueError raise ValueError
@staticmethod @staticmethod
...@@ -400,7 +458,7 @@ class ZipParser(ArchiveBasedAbstractParser): ...@@ -400,7 +458,7 @@ class ZipParser(ArchiveBasedAbstractParser):
assert isinstance(member, zipfile.ZipInfo) # please mypy assert isinstance(member, zipfile.ZipInfo) # please mypy
with open(full_path, 'rb') as f: with open(full_path, 'rb') as f:
archive.writestr(member, f.read(), archive.writestr(member, f.read(),
compress_type=self.zip_compression_type) compress_type=member.compress_type)
@staticmethod @staticmethod
def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]: def _get_all_members(archive: ArchiveClass) -> List[ArchiveMember]:
...@@ -411,3 +469,19 @@ class ZipParser(ArchiveBasedAbstractParser): ...@@ -411,3 +469,19 @@ class ZipParser(ArchiveBasedAbstractParser):
def _get_member_name(member: ArchiveMember) -> str: def _get_member_name(member: ArchiveMember) -> str:
assert isinstance(member, zipfile.ZipInfo) # please mypy assert isinstance(member, zipfile.ZipInfo) # please mypy
return member.filename return member.filename
@staticmethod
def _get_member_compression(member: ArchiveMember):
assert isinstance(member, zipfile.ZipInfo) # please mypy
return member.compress_type
@staticmethod
def _set_member_compression(member: ArchiveMember, compression) -> ArchiveMember:
assert isinstance(member, zipfile.ZipInfo) # please mypy
member.compress_type = compression
return member
@staticmethod
def _is_dir(member: ArchiveMember) -> bool:
assert isinstance(member, zipfile.ZipInfo) # please mypy
return member.is_dir()
...@@ -2,42 +2,51 @@ import mimetypes ...@@ -2,42 +2,51 @@ import mimetypes
import os import os
import shutil import shutil
import tempfile import tempfile
from typing import Dict, Union from typing import Union, Dict
import mutagen import mutagen
from . import abstract, parser_factory from . import abstract, parser_factory, video
class MutagenParser(abstract.AbstractParser): class MutagenParser(abstract.AbstractParser):
def __init__(self, filename): def __init__(self, filename):
super().__init__(filename) super().__init__(filename)
try: try:
mutagen.File(self.filename) if mutagen.File(self.filename) is None:
raise ValueError
except mutagen.MutagenError: except mutagen.MutagenError:
raise ValueError raise ValueError
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, Dict]]:
f = mutagen.File(self.filename) f = mutagen.File(self.filename)
if f.tags: if f.tags:
return {k:', '.join(v) for k, v in f.tags.items()} return {k: ', '.join(map(str, v)) for k, v in f.tags.items()}
return {} return {}
def remove_all(self) -> bool: def remove_all(self) -> bool:
shutil.copy(self.filename, self.output_filename) shutil.copy(self.filename, self.output_filename)
f = mutagen.File(self.output_filename) f = mutagen.File(self.output_filename)
try:
f.delete() f.delete()
f.save() f.save()
except mutagen.MutagenError:
raise ValueError
return True return True
class MP3Parser(MutagenParser): class MP3Parser(MutagenParser):
mimetypes = {'audio/mpeg', } mimetypes = {'audio/mpeg', }
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, Dict]]:
metadata = {} # type: Dict[str, Union[str, dict]] metadata: Dict[str, Union[str, Dict]] = dict()
meta = mutagen.File(self.filename).tags meta = mutagen.File(self.filename).tags
if not meta:
return metadata
for key in meta: for key in meta:
if isinstance(key, tuple):
metadata[key[0]] = key[1]
continue
if not hasattr(meta[key], 'text'): # pragma: no cover if not hasattr(meta[key], 'text'): # pragma: no cover
continue continue
metadata[key.rstrip(' \t\r\n\0')] = ', '.join(map(str, meta[key].text)) metadata[key.rstrip(' \t\r\n\0')] = ', '.join(map(str, meta[key].text))
...@@ -59,7 +68,7 @@ class FLACParser(MutagenParser): ...@@ -59,7 +68,7 @@ class FLACParser(MutagenParser):
f.save(deleteid3=True) f.save(deleteid3=True)
return True return True
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, Dict]]:
meta = super().get_meta() meta = super().get_meta()
for num, picture in enumerate(mutagen.File(self.filename).pictures): for num, picture in enumerate(mutagen.File(self.filename).pictures):
name = picture.desc if picture.desc else 'Cover %d' % num name = picture.desc if picture.desc else 'Cover %d' % num
...@@ -73,7 +82,33 @@ class FLACParser(MutagenParser): ...@@ -73,7 +82,33 @@ class FLACParser(MutagenParser):
with open(fname, 'wb') as f: with open(fname, 'wb') as f:
f.write(picture.data) f.write(picture.data)
p, _ = parser_factory.get_parser(fname) # type: ignore p, _ = parser_factory.get_parser(fname) # type: ignore
if p is None:
raise ValueError
p.sandbox = self.sandbox
# Mypy chokes on ternaries :/ # Mypy chokes on ternaries :/
meta[name] = p.get_meta() if p else 'harmful data' # type: ignore meta[name] = p.get_meta() if p else 'harmful data' # type: ignore
os.remove(fname) os.remove(fname)
return meta return meta
class WAVParser(video.AbstractFFmpegParser):
mimetypes = {'audio/x-wav', }
meta_allowlist = {'AvgBytesPerSec', 'BitsPerSample', 'Directory',
'Duration', 'Encoding', 'ExifToolVersion',
'FileAccessDate', 'FileInodeChangeDate',
'FileModifyDate', 'FileName', 'FilePermissions',
'FileSize', 'FileType', 'FileTypeExtension',
'MIMEType', 'NumChannels', 'SampleRate', 'SourceFile',
}
class AIFFParser(video.AbstractFFmpegParser):
mimetypes = {'audio/aiff', 'audio/x-aiff'}
meta_allowlist = {'AvgBytesPerSec', 'BitsPerSample', 'Directory',
'Duration', 'Encoding', 'ExifToolVersion',
'FileAccessDate', 'FileInodeChangeDate',
'FileModifyDate', 'FileName', 'FilePermissions',
'FileSize', 'FileType', 'FileTypeExtension',
'MIMEType', 'NumChannels', 'SampleRate', 'SourceFile',
'NumSampleFrames', 'SampleSize',
}
...@@ -11,7 +11,8 @@ import os ...@@ -11,7 +11,8 @@ import os
import shutil import shutil
import subprocess import subprocess
import tempfile import tempfile
from typing import List, Optional import functools
from typing import Optional, List
__all__ = ['PIPE', 'run', 'CalledProcessError'] __all__ = ['PIPE', 'run', 'CalledProcessError']
...@@ -21,16 +22,15 @@ CalledProcessError = subprocess.CalledProcessError ...@@ -21,16 +22,15 @@ CalledProcessError = subprocess.CalledProcessError
# pylint: disable=subprocess-run-check # pylint: disable=subprocess-run-check
@functools.lru_cache(maxsize=None)
def _get_bwrap_path() -> str: def _get_bwrap_path() -> str:
bwrap_path = '/usr/bin/bwrap' which_path = shutil.which('bwrap')
if os.path.isfile(bwrap_path): if which_path:
if os.access(bwrap_path, os.X_OK): return which_path
return bwrap_path
raise RuntimeError("Unable to find bwrap") # pragma: no cover raise RuntimeError("Unable to find bwrap") # pragma: no cover
# pylint: disable=bad-whitespace
def _get_bwrap_args(tempdir: str, def _get_bwrap_args(tempdir: str,
input_filename: str, input_filename: str,
output_filename: Optional[str] = None) -> List[str]: output_filename: Optional[str] = None) -> List[str]:
...@@ -39,7 +39,7 @@ def _get_bwrap_args(tempdir: str, ...@@ -39,7 +39,7 @@ def _get_bwrap_args(tempdir: str,
# XXX: use --ro-bind-try once all supported platforms # XXX: use --ro-bind-try once all supported platforms
# have a bubblewrap recent enough to support it. # have a bubblewrap recent enough to support it.
ro_bind_dirs = ['/usr', '/lib', '/lib64', '/bin', '/sbin', cwd] ro_bind_dirs = ['/usr', '/lib', '/lib64', '/bin', '/sbin', '/etc/alternatives', cwd]
for bind_dir in ro_bind_dirs: for bind_dir in ro_bind_dirs:
if os.path.isdir(bind_dir): # pragma: no cover if os.path.isdir(bind_dir): # pragma: no cover
ro_bind_args.extend(['--ro-bind', bind_dir, bind_dir]) ro_bind_args.extend(['--ro-bind', bind_dir, bind_dir])
...@@ -78,7 +78,6 @@ def _get_bwrap_args(tempdir: str, ...@@ -78,7 +78,6 @@ def _get_bwrap_args(tempdir: str,
return args return args
# pylint: disable=bad-whitespace
def run(args: List[str], def run(args: List[str],
input_filename: str, input_filename: str,
output_filename: Optional[str] = None, output_filename: Optional[str] = None,
......
import logging import logging
import re import re
import uuid import uuid
import zipfile
import xml.etree.ElementTree as ET # type: ignore import xml.etree.ElementTree as ET # type: ignore
from typing import Any, Dict
from . import archive, office from . import archive, office
class EPUBParser(archive.ZipParser): class EPUBParser(archive.ZipParser):
mimetypes = {'application/epub+zip', } mimetypes = {'application/epub+zip', }
metadata_namespace = '{http://purl.org/dc/elements/1.1/}' metadata_namespace = '{http://purl.org/dc/elements/1.1/}'
...@@ -15,11 +18,27 @@ class EPUBParser(archive.ZipParser): ...@@ -15,11 +18,27 @@ class EPUBParser(archive.ZipParser):
'META-INF/container.xml', 'META-INF/container.xml',
'mimetype', 'mimetype',
'OEBPS/content.opf', 'OEBPS/content.opf',
'content.opf',
'hmh.opf',
'OPS/.+.xml'
}))
self.files_to_omit = set(map(re.compile, { # type: ignore
'iTunesMetadata.plist',
'META-INF/calibre_bookmarks.txt',
'OEBPS/package.opf',
})) }))
self.uniqid = uuid.uuid4() self.uniqid = uuid.uuid4()
def _specific_get_meta(self, full_path, file_path): def is_archive_valid(self):
if file_path != 'OEBPS/content.opf': super().is_archive_valid()
with zipfile.ZipFile(self.filename) as zin:
for item in self._get_all_members(zin):
member_name = self._get_member_name(item)
if member_name.endswith('META-INF/encryption.xml'):
raise ValueError('the file contains encrypted fonts')
def _specific_get_meta(self, full_path, file_path) -> Dict[str, Any]:
if not file_path.endswith('.opf'):
return {} return {}
with open(full_path, encoding='utf-8') as f: with open(full_path, encoding='utf-8') as f:
...@@ -30,14 +49,31 @@ class EPUBParser(archive.ZipParser): ...@@ -30,14 +49,31 @@ class EPUBParser(archive.ZipParser):
except (TypeError, UnicodeDecodeError): except (TypeError, UnicodeDecodeError):
return {file_path: 'harmful content', } return {file_path: 'harmful content', }
def _specific_cleanup(self, full_path: str): def _specific_cleanup(self, full_path: str) -> bool:
if full_path.endswith('OEBPS/content.opf'): if full_path.endswith('hmh.opf') or full_path.endswith('content.opf'):
return self.__handle_contentopf(full_path) return self.__handle_contentopf(full_path)
elif full_path.endswith('OEBPS/toc.ncx'): elif full_path.endswith('OEBPS/toc.ncx'):
return self.__handle_tocncx(full_path) return self.__handle_tocncx(full_path)
elif re.search('/OPS/[^/]+.xml$', full_path):
return self.__handle_ops_xml(full_path)
return True
def __handle_ops_xml(self, full_path: str) -> bool:
try:
tree, namespace = office._parse_xml(full_path)
except ET.ParseError: # pragma: nocover
logging.error("Unable to parse %s in %s.", full_path, self.filename)
return False
for item in tree.iterfind('.//', namespace): # pragma: nocover
if item.tag.strip().lower().endswith('head'):
item.clear()
break
tree.write(full_path, xml_declaration=True, encoding='utf-8',
short_empty_elements=False)
return True return True
def __handle_tocncx(self, full_path: str): def __handle_tocncx(self, full_path: str) -> bool:
try: try:
tree, namespace = office._parse_xml(full_path) tree, namespace = office._parse_xml(full_path)
except ET.ParseError: # pragma: nocover except ET.ParseError: # pragma: nocover
...@@ -53,7 +89,7 @@ class EPUBParser(archive.ZipParser): ...@@ -53,7 +89,7 @@ class EPUBParser(archive.ZipParser):
short_empty_elements=False) short_empty_elements=False)
return True return True
def __handle_contentopf(self, full_path: str): def __handle_contentopf(self, full_path: str) -> bool:
try: try:
tree, namespace = office._parse_xml(full_path) tree, namespace = office._parse_xml(full_path)
except ET.ParseError: except ET.ParseError:
...@@ -71,7 +107,7 @@ class EPUBParser(archive.ZipParser): ...@@ -71,7 +107,7 @@ class EPUBParser(archive.ZipParser):
item.append(uniqid) item.append(uniqid)
# items without mandatory content # items without mandatory content
for name in {'language', 'title'}: for name in ['language', 'title']:
uniqid = ET.Element(self.metadata_namespace + name) uniqid = ET.Element(self.metadata_namespace + name)
item.append(uniqid) item.append(uniqid)
break # there is only a single <metadata> block break # there is only a single <metadata> block
......
...@@ -2,24 +2,22 @@ import functools ...@@ -2,24 +2,22 @@ import functools
import json import json
import logging import logging
import os import os
import shutil
import subprocess import subprocess
from typing import Dict, Union, Set from typing import Union, Set, Dict
from . import abstract from . import abstract
from . import bubblewrap from . import bubblewrap
# Make pyflakes happy
assert Set
class ExiftoolParser(abstract.AbstractParser): class ExiftoolParser(abstract.AbstractParser):
""" Exiftool is often the easiest way to get all the metadata """ Exiftool is often the easiest way to get all the metadata
from a import file, hence why several parsers are re-using its `get_meta` from a import file, hence why several parsers are re-using its `get_meta`
method. method.
""" """
meta_allowlist = set() # type: Set[str] meta_allowlist: Set[str] = set()
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, Dict]]:
try: try:
if self.sandbox: if self.sandbox:
out = bubblewrap.run([_get_exiftool_path(), '-json', out = bubblewrap.run([_get_exiftool_path(), '-json',
...@@ -69,16 +67,14 @@ class ExiftoolParser(abstract.AbstractParser): ...@@ -69,16 +67,14 @@ class ExiftoolParser(abstract.AbstractParser):
return False return False
return True return True
@functools.lru_cache() @functools.lru_cache(maxsize=None)
def _get_exiftool_path() -> str: # pragma: no cover def _get_exiftool_path() -> str: # pragma: no cover
possible_pathes = { which_path = shutil.which('exiftool')
'/usr/bin/exiftool', # debian/fedora if which_path:
'/usr/bin/vendor_perl/exiftool', # archlinux return which_path
}
for possible_path in possible_pathes: # Exiftool on Arch Linux has a weird path
if os.path.isfile(possible_path): if os.access('/usr/bin/vendor_perl/exiftool', os.X_OK):
if os.access(possible_path, os.X_OK): return '/usr/bin/vendor_perl/exiftool'
return possible_path
raise RuntimeError("Unable to find exiftool") raise RuntimeError("Unable to find exiftool")
import shutil import shutil
from typing import Dict, Union from typing import Union, Dict
from . import abstract from . import abstract
class HarmlessParser(abstract.AbstractParser): class HarmlessParser(abstract.AbstractParser):
""" This is the parser for filetypes that can not contain metadata. """ """ This is the parser for filetypes that can not contain metadata. """
mimetypes = {'text/plain', 'image/x-ms-bmp'} mimetypes = {'text/plain', 'image/x-ms-bmp', 'image/bmp'}
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, Dict]]:
return dict() return dict()
def remove_all(self) -> bool: def remove_all(self) -> bool:
......
import imghdr
import os import os
import re import re
from typing import Set, Dict, Union, Any from typing import Union, Any, Dict
import cairo import cairo
...@@ -12,10 +11,6 @@ from gi.repository import GdkPixbuf, GLib, Rsvg ...@@ -12,10 +11,6 @@ from gi.repository import GdkPixbuf, GLib, Rsvg
from . import exiftool, abstract from . import exiftool, abstract
# Make pyflakes happy
assert Set
assert Any
class SVGParser(exiftool.ExiftoolParser): class SVGParser(exiftool.ExiftoolParser):
mimetypes = {'image/svg+xml', } mimetypes = {'image/svg+xml', }
meta_allowlist = {'Directory', 'ExifToolVersion', 'FileAccessDate', meta_allowlist = {'Directory', 'ExifToolVersion', 'FileAccessDate',
...@@ -26,25 +21,40 @@ class SVGParser(exiftool.ExiftoolParser): ...@@ -26,25 +21,40 @@ class SVGParser(exiftool.ExiftoolParser):
} }
def remove_all(self) -> bool: def remove_all(self) -> bool:
try:
svg = Rsvg.Handle.new_from_file(self.filename) svg = Rsvg.Handle.new_from_file(self.filename)
except GLib.GError:
raise ValueError
try:
_, _, _, _, has_viewbox, viewbox = svg.get_intrinsic_dimensions()
if has_viewbox is False:
raise ValueError
_, width, height = svg.get_intrinsic_size_in_pixels()
except AttributeError:
dimensions = svg.get_dimensions() dimensions = svg.get_dimensions()
surface = cairo.SVGSurface(self.output_filename, height, width = dimensions.height, dimensions.width
dimensions.height,
dimensions.width) surface = cairo.SVGSurface(self.output_filename, height, width)
context = cairo.Context(surface) context = cairo.Context(surface)
try:
svg.render_document(context, viewbox)
except AttributeError:
svg.render_cairo(context) svg.render_cairo(context)
surface.finish() surface.finish()
return True return True
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, Dict]]:
meta = super().get_meta() meta = super().get_meta()
# The namespace is mandatory, but only the …/2000/svg is valid. # The namespace is mandatory, but only the …/2000/svg is valid.
ns = 'http://www.w3.org/2000/svg' ns = 'http://www.w3.org/2000/svg'
if meta.get('Xmlns', ns) == ns: if meta.get('Xmlns') == ns:
meta.pop('Xmlns') meta.pop('Xmlns')
return meta return meta
class PNGParser(exiftool.ExiftoolParser): class PNGParser(exiftool.ExiftoolParser):
mimetypes = {'image/png', } mimetypes = {'image/png', }
meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName', meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName',
...@@ -58,12 +68,9 @@ class PNGParser(exiftool.ExiftoolParser): ...@@ -58,12 +68,9 @@ class PNGParser(exiftool.ExiftoolParser):
def __init__(self, filename): def __init__(self, filename):
super().__init__(filename) super().__init__(filename)
if imghdr.what(filename) != 'png':
raise ValueError
try: # better fail here than later try: # better fail here than later
cairo.ImageSurface.create_from_png(self.filename) cairo.ImageSurface.create_from_png(self.filename)
except Exception: # pragma: no cover except: # pragma: no cover
# Cairo is returning some weird exceptions :/ # Cairo is returning some weird exceptions :/
raise ValueError raise ValueError
...@@ -98,7 +105,6 @@ class GdkPixbufAbstractParser(exiftool.ExiftoolParser): ...@@ -98,7 +105,6 @@ class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
def __init__(self, filename): def __init__(self, filename):
super().__init__(filename) super().__init__(filename)
# we can't use imghdr here because of https://bugs.python.org/issue28591
try: try:
GdkPixbuf.Pixbuf.new_from_file(self.filename) GdkPixbuf.Pixbuf.new_from_file(self.filename)
except GLib.GError: except GLib.GError:
...@@ -110,6 +116,7 @@ class GdkPixbufAbstractParser(exiftool.ExiftoolParser): ...@@ -110,6 +116,7 @@ class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
_, extension = os.path.splitext(self.filename) _, extension = os.path.splitext(self.filename)
pixbuf = GdkPixbuf.Pixbuf.new_from_file(self.filename) pixbuf = GdkPixbuf.Pixbuf.new_from_file(self.filename)
pixbuf = GdkPixbuf.Pixbuf.apply_embedded_orientation(pixbuf)
if extension.lower() == '.jpg': if extension.lower() == '.jpg':
extension = '.jpeg' # gdk is picky extension = '.jpeg' # gdk is picky
elif extension.lower() == '.tif': elif extension.lower() == '.tif':
...@@ -132,7 +139,7 @@ class JPGParser(GdkPixbufAbstractParser): ...@@ -132,7 +139,7 @@ class JPGParser(GdkPixbufAbstractParser):
'MIMEType', 'ImageWidth', 'ImageSize', 'BitsPerSample', 'MIMEType', 'ImageWidth', 'ImageSize', 'BitsPerSample',
'ColorComponents', 'EncodingProcess', 'JFIFVersion', 'ColorComponents', 'EncodingProcess', 'JFIFVersion',
'ResolutionUnit', 'XResolution', 'YCbCrSubSampling', 'ResolutionUnit', 'XResolution', 'YCbCrSubSampling',
'YResolution', 'Megapixels', 'ImageHeight'} 'YResolution', 'Megapixels', 'ImageHeight', 'Orientation'}
class TiffParser(GdkPixbufAbstractParser): class TiffParser(GdkPixbufAbstractParser):
...@@ -146,13 +153,14 @@ class TiffParser(GdkPixbufAbstractParser): ...@@ -146,13 +153,14 @@ class TiffParser(GdkPixbufAbstractParser):
'FileInodeChangeDate', 'FileModifyDate', 'FileName', 'FileInodeChangeDate', 'FileModifyDate', 'FileName',
'FilePermissions', 'FileSize', 'FileType', 'FilePermissions', 'FileSize', 'FileType',
'FileTypeExtension', 'ImageHeight', 'ImageSize', 'FileTypeExtension', 'ImageHeight', 'ImageSize',
'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile'} 'ImageWidth', 'MIMEType', 'Megapixels', 'SourceFile', 'Orientation'}
class PPMParser(abstract.AbstractParser): class PPMParser(abstract.AbstractParser):
mimetypes = {'image/x-portable-pixmap'} mimetypes = {'image/x-portable-pixmap'}
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, Dict]]:
meta = {} # type: Dict[str, Union[str, Dict[Any, Any]]] meta: Dict[str, Union[str, Dict[Any, Any]]] = dict()
with open(self.filename) as f: with open(self.filename) as f:
for idx, line in enumerate(f): for idx, line in enumerate(f):
if line.lstrip().startswith('#'): if line.lstrip().startswith('#'):
...@@ -167,3 +175,36 @@ class PPMParser(abstract.AbstractParser): ...@@ -167,3 +175,36 @@ class PPMParser(abstract.AbstractParser):
line = re.sub(r"\s+", "", line, flags=re.UNICODE) line = re.sub(r"\s+", "", line, flags=re.UNICODE)
fout.write(line) fout.write(line)
return True return True
class HEICParser(exiftool.ExiftoolParser):
mimetypes = {'image/heic'}
meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
'FileSize', 'FileModifyDate', 'FileAccessDate',
'FileInodeChangeDate', 'FilePermissions', 'FileType',
'FileTypeExtension', 'MIMEType', 'MajorBrand', 'MinorVersion',
'CompatibleBrands','HandlerType', 'PrimaryItemReference',
'HEVCConfigurationVersion', 'GeneralProfileSpace',
'GeneralTierFlag', 'GeneralProfileIDC',
'GenProfileCompatibilityFlags', 'ConstraintIndicatorFlags',
'GeneralLevelIDC', 'MinSpatialSegmentationIDC',
'ParallelismType','ChromaFormat', 'BitDepthLuma', 'BitDepthChroma',
'NumTemporalLayers', 'TemporalIDNested', 'ImageWidth',
'ImageHeight', 'ImageSpatialExtent', 'ImagePixelDepth',
'AverageFrameRate', 'ConstantFrameRate', 'MediaDataSize',
'MediaDataOffset','ImageSize', 'Megapixels'}
def remove_all(self) -> bool:
return self._lightweight_cleanup()
class WEBPParser(GdkPixbufAbstractParser):
mimetypes = {'image/webp'}
meta_allowlist = {'SourceFile', 'ExifToolVersion', 'FileName',
'Directory', 'FileSize', 'FileModifyDate',
'FileAccessDate', "FileInodeChangeDate",
'FilePermissions', 'FileType', 'FileTypeExtension',
'MIMEType', 'ImageWidth', 'ImageSize', 'BitsPerSample',
'ColorComponents', 'EncodingProcess', 'JFIFVersion',
'ResolutionUnit', 'XResolution', 'YCbCrSubSampling',
'YResolution', 'Megapixels', 'ImageHeight', 'Orientation',
'HorizontalScale', 'VerticalScale', 'VP8Version'}
This diff is collapsed.
import logging
import glob import glob
import os import os
import mimetypes import mimetypes
import importlib import importlib
from typing import TypeVar, List, Tuple, Optional from typing import TypeVar, Optional, List, Tuple
from . import abstract, UNSUPPORTED_EXTENSIONS from . import abstract, UNSUPPORTED_EXTENSIONS
...@@ -12,6 +11,10 @@ T = TypeVar('T', bound='abstract.AbstractParser') ...@@ -12,6 +11,10 @@ T = TypeVar('T', bound='abstract.AbstractParser')
mimetypes.add_type('application/epub+zip', '.epub') mimetypes.add_type('application/epub+zip', '.epub')
mimetypes.add_type('application/x-dtbncx+xml', '.ncx') # EPUB Navigation Control XML File mimetypes.add_type('application/x-dtbncx+xml', '.ncx') # EPUB Navigation Control XML File
# This should be removed after we move to python3.10
# https://github.com/python/cpython/commit/20a5b7e986377bdfd929d7e8c4e3db5847dfdb2d
mimetypes.add_type('image/heic', '.heic')
def __load_all_parsers(): def __load_all_parsers():
""" Loads every parser in a dynamic way """ """ Loads every parser in a dynamic way """
...@@ -40,7 +43,10 @@ def _get_parsers() -> List[T]: ...@@ -40,7 +43,10 @@ def _get_parsers() -> List[T]:
def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]: def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
""" Return the appropriate parser for a given filename. """ """ Return the appropriate parser for a given filename.
:raises ValueError: Raised if the instantiation of the parser went wrong.
"""
mtype, _ = mimetypes.guess_type(filename) mtype, _ = mimetypes.guess_type(filename)
_, extension = os.path.splitext(filename) _, extension = os.path.splitext(filename)
...@@ -53,10 +59,6 @@ def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]: ...@@ -53,10 +59,6 @@ def get_parser(filename: str) -> Tuple[Optional[T], Optional[str]]:
for parser_class in _get_parsers(): # type: ignore for parser_class in _get_parsers(): # type: ignore
if mtype in parser_class.mimetypes: if mtype in parser_class.mimetypes:
try: # This instantiation might raise a ValueError on malformed files
return parser_class(filename), mtype return parser_class(filename), mtype
except ValueError as e:
logging.info("Got an exception when trying to instantiate "
"%s for %s: %s", parser_class, filename, e)
return None, mtype
return None, mtype return None, mtype
...@@ -7,8 +7,7 @@ import re ...@@ -7,8 +7,7 @@ import re
import logging import logging
import tempfile import tempfile
import io import io
from typing import Dict, Union from typing import Union, Dict
from distutils.version import LooseVersion
import cairo import cairo
import gi import gi
...@@ -17,10 +16,7 @@ from gi.repository import Poppler, GLib ...@@ -17,10 +16,7 @@ from gi.repository import Poppler, GLib
from . import abstract from . import abstract
poppler_version = Poppler.get_version() FIXED_PDF_VERSION = cairo.PDFVersion.VERSION_1_5
if LooseVersion(poppler_version) < LooseVersion('0.46'): # pragma: no cover
raise ValueError("mat2 needs at least Poppler version 0.46 to work. \
The installed version is %s." % poppler_version) # pragma: no cover
class PDFParser(abstract.AbstractParser): class PDFParser(abstract.AbstractParser):
...@@ -32,7 +28,7 @@ class PDFParser(abstract.AbstractParser): ...@@ -32,7 +28,7 @@ class PDFParser(abstract.AbstractParser):
def __init__(self, filename): def __init__(self, filename):
super().__init__(filename) super().__init__(filename)
self.uri = 'file://' + os.path.abspath(self.filename) self.uri = 'file://' + os.path.abspath(self.filename)
self.__scale = 2 # how much precision do we want for the render self.__scale = 200 / 72.0 # how much precision do we want for the render
try: # Check now that the file is valid, to avoid surprises later try: # Check now that the file is valid, to avoid surprises later
Poppler.Document.new_from_file(self.uri, None) Poppler.Document.new_from_file(self.uri, None)
except GLib.GError: # Invalid PDF except GLib.GError: # Invalid PDF
...@@ -40,7 +36,10 @@ class PDFParser(abstract.AbstractParser): ...@@ -40,7 +36,10 @@ class PDFParser(abstract.AbstractParser):
def remove_all(self) -> bool: def remove_all(self) -> bool:
if self.lightweight_cleaning is True: if self.lightweight_cleaning is True:
try:
return self.__remove_all_lightweight() return self.__remove_all_lightweight()
except (cairo.Error, MemoryError) as e:
raise RuntimeError(e)
return self.__remove_all_thorough() return self.__remove_all_thorough()
def __remove_all_lightweight(self) -> bool: def __remove_all_lightweight(self) -> bool:
...@@ -52,6 +51,7 @@ class PDFParser(abstract.AbstractParser): ...@@ -52,6 +51,7 @@ class PDFParser(abstract.AbstractParser):
tmp_path = tempfile.mkstemp()[1] tmp_path = tempfile.mkstemp()[1]
pdf_surface = cairo.PDFSurface(tmp_path, 10, 10) # resized later anyway pdf_surface = cairo.PDFSurface(tmp_path, 10, 10) # resized later anyway
pdf_surface.restrict_to_version(FIXED_PDF_VERSION)
pdf_context = cairo.Context(pdf_surface) # context draws on the surface pdf_context = cairo.Context(pdf_surface) # context draws on the surface
for pagenum in range(pages_count): for pagenum in range(pages_count):
...@@ -80,15 +80,19 @@ class PDFParser(abstract.AbstractParser): ...@@ -80,15 +80,19 @@ class PDFParser(abstract.AbstractParser):
_, tmp_path = tempfile.mkstemp() _, tmp_path = tempfile.mkstemp()
pdf_surface = cairo.PDFSurface(tmp_path, 32, 32) # resized later anyway pdf_surface = cairo.PDFSurface(tmp_path, 32, 32) # resized later anyway
pdf_surface.restrict_to_version(FIXED_PDF_VERSION)
pdf_context = cairo.Context(pdf_surface) pdf_context = cairo.Context(pdf_surface)
for pagenum in range(pages_count): for pagenum in range(pages_count):
page = document.get_page(pagenum) page = document.get_page(pagenum)
if page is None: # pragma: no cover
logging.error("Unable to get PDF pages")
return False
page_width, page_height = page.get_size() page_width, page_height = page.get_size()
logging.info("Rendering page %d/%d", pagenum + 1, pages_count) logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
width = int(page_width) * self.__scale width = int(page_width * self.__scale)
height = int(page_height) * self.__scale height = int(page_height * self.__scale)
img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, height) img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, height)
img_context = cairo.Context(img_surface) img_context = cairo.Context(img_surface)
...@@ -102,7 +106,11 @@ class PDFParser(abstract.AbstractParser): ...@@ -102,7 +106,11 @@ class PDFParser(abstract.AbstractParser):
buf.seek(0) buf.seek(0)
img = cairo.ImageSurface.create_from_png(buf) img = cairo.ImageSurface.create_from_png(buf)
pdf_surface.set_size(page_width*self.__scale, page_height*self.__scale) if cairo.version_info < (1, 12, 0):
pdf_surface.set_size(width, height)
else:
pdf_surface.set_size(page_width, page_height)
pdf_surface.set_device_scale(1 / self.__scale, 1 / self.__scale)
pdf_context.set_source_surface(img, 0, 0) pdf_context.set_source_surface(img, 0, 0)
pdf_context.paint() pdf_context.paint()
pdf_context.show_page() # draw pdf_context on pdf_surface pdf_context.show_page() # draw pdf_context on pdf_surface
...@@ -122,6 +130,17 @@ class PDFParser(abstract.AbstractParser): ...@@ -122,6 +130,17 @@ class PDFParser(abstract.AbstractParser):
document.set_creator('') document.set_creator('')
document.set_creation_date(-1) document.set_creation_date(-1)
document.save('file://' + os.path.abspath(out_file)) document.save('file://' + os.path.abspath(out_file))
# Cairo adds "/Producer" and "/CreationDate", and Poppler sometimes
# fails to remove them, we have to use this terrible regex.
# It should(tm) be alright though, because cairo's output format
# for metadata is fixed.
with open(out_file, 'rb') as f:
out = re.sub(rb'<<[\s\n]*/Producer.*?>>', b' << >>', f.read(),
count=0, flags=re.DOTALL | re.IGNORECASE)
with open(out_file, 'wb') as f:
f.write(out)
return True return True
@staticmethod @staticmethod
...@@ -131,7 +150,7 @@ class PDFParser(abstract.AbstractParser): ...@@ -131,7 +150,7 @@ class PDFParser(abstract.AbstractParser):
metadata[key] = value metadata[key] = value
return metadata return metadata
def get_meta(self) -> Dict[str, Union[str, dict]]: def get_meta(self) -> Dict[str, Union[str, Dict]]:
""" Return a dict with all the meta of the file """ Return a dict with all the meta of the file
""" """
metadata = {} metadata = {}
......