Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • tguinot/mat2
  • jvoisin/mat2
  • dachary/mat2
  • mejo-/mat2
  • LogicalDash/mat2
  • dkg/mat2
  • christian/mat2
  • Selflike323/mat2
  • fz/mat2
  • iwwmidatlanticgdc/mat2
  • Gu1nn3zz/mat2
  • smagnin/mat2
  • flashcode/mat2
  • MANCASTILLEJA/mat2
  • jboursier/mat2
  • tails/mat2
  • matiargs/mat2
  • Brolf/mat2
  • madaidan/mat2
  • Delmer84/mat2
  • yuebyzua/mat2
  • yyyyyyyan/mat2
  • rmnvgr/mat2
  • Marxism-Leninism/mat2
  • GNUtoo/mat2
  • allexj/mat2
  • b068931cc450442b63f5b3d276ea4297/mat2
  • chenrui/mat2
  • nosec13346/mat2
  • anelki/mat2
30 results
Show changes
Commits on Source (414)
.*
*.pyc
.coverage
.eggs
.mypy_cache/
build
dist
mat2.egg-info
tags
image: debian
variables:
CONTAINER_REGISTRY: $CI_REGISTRY/georg/mat2-ci-images
GIT_DEPTH: "5"
GIT_STRATEGY: clone
stages:
- linting
- test
bandit:
stage: linting
script: # TODO: remove B405 and B314
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends python3-bandit
- bandit ./mat2 --format txt
- bandit -r ./nautilus/ --format txt --skip B101
- bandit -r ./libmat2 --format txt --skip B101,B404,B603,B405,B314
pylint:
.prepare_env: &prepare_env
before_script: # This is needed to not run the testsuite as root
- useradd --home-dir ${CI_PROJECT_DIR} mat2
- chown -R mat2 .
linting:ruff:
image: $CONTAINER_REGISTRY:linting
stage: linting
script:
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends pylint3 python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0
- pylint3 --extension-pkg-whitelist=cairo,gi ./libmat2 ./mat2
# Once nautilus-python is in Debian, decomment it form the line below
- pylint3 --extension-pkg-whitelist=Nautilus,GObject,Gtk,Gio,GLib,gi ./nautilus/nautilus_mat2.py
- apt update
- apt install -qqy --no-install-recommends python3-venv
- python3 -m venv venv
- source venv/bin/activate
- pip3 install ruff
- ruff check .
pyflakes:
linting:mypy:
image: $CONTAINER_REGISTRY:linting
stage: linting
script:
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends pyflakes3
- pyflakes3 ./libmat2 ./mat2 ./tests/ ./nautilus
- mypy --ignore-missing-imports mat2 libmat2/*.py
mypy:
stage: linting
tests:archlinux:
image: $CONTAINER_REGISTRY:archlinux
stage: test
script:
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends python3-pip
- pip3 install mypy
- mypy mat2 libmat2/*.py --ignore-missing-imports
- mypy --ignore-missing-imports ./nautilus/nautilus_mat2.py
- python3 -m unittest discover -v
tests:debian:
image: $CONTAINER_REGISTRY:debian
stage: test
<<: *prepare_env
script:
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage
- python3-coverage run --branch -m unittest discover -s tests/
- python3-coverage report -m --include 'libmat2/*'
- apt-get -qqy purge bubblewrap
- su - mat2 -c "python3-coverage run --branch -m unittest discover -s tests/"
- su - mat2 -c "python3-coverage report --fail-under=95 -m --include 'libmat2/*'"
tests:debian_with_bubblewrap:
image: $CONTAINER_REGISTRY:debian
stage: test
allow_failure: true
<<: *prepare_env
script:
- apt-get -qqy install bubblewrap
- python3 -m unittest discover -v
tests:fedora:
image: fedora
image: $CONTAINER_REGISTRY:fedora
stage: test
script:
- python3 -m unittest discover -v
tests:gentoo:
image: $CONTAINER_REGISTRY:gentoo
stage: test
<<: *prepare_env
script:
- su - mat2 -c "python3 -m unittest discover -v"
tests:python3.7:
image: $CONTAINER_REGISTRY:python3.7
stage: test
script:
- python3 -m unittest discover -v
tests:python3.8:
image: $CONTAINER_REGISTRY:python3.8
stage: test
script:
- python3 -m unittest discover -v
tests:python3.9:
image: $CONTAINER_REGISTRY:python3.9
stage: test
script:
- python3 -m unittest discover -v
tests:python3.10:
image: $CONTAINER_REGISTRY:python3.10
stage: test
script:
- python3 -m unittest discover -v
tests:python3.11:
image: $CONTAINER_REGISTRY:python3.11
stage: test
script:
- python3 -m unittest discover -v
tests:python3.12:
image: $CONTAINER_REGISTRY:python3.12
stage: test
script:
- python3 -m unittest discover -v
tests:python3.13:
image: $CONTAINER_REGISTRY:python3.13
stage: test
script:
- python3 -m unittest discover -v
tests:python3.14:
image: $CONTAINER_REGISTRY:python3.14
stage: test
script:
- dnf install -y python3 python3-mutagen python3-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 gdk-pixbuf2-modules cairo-gobject cairo python3-cairo perl-Image-ExifTool mailcap
- gdk-pixbuf-query-loaders-64 > /usr/lib64/gdk-pixbuf-2.0/2.10.0/loaders.cache
- python3 setup.py test
- python3 -m unittest discover -v
Julien (jvoisin) Voisin <julien.voisin+mat2@dustri.org> totallylegit <totallylegit@dustri.org>
Julien (jvoisin) Voisin <julien.voisin+mat2@dustri.org> jvoisin <julien.voisin@dustri.org>
Julien (jvoisin) Voisin <julien.voisin+mat2@dustri.org> jvoisin <jvoisin@riseup.net>
Daniel Kahn Gillmor <dkg@fifthhorseman.net> dkg <dkg@fifthhorseman.net>
......@@ -6,11 +6,13 @@ max-locals=20
disable=
fixme,
invalid-name,
duplicate-code,
missing-docstring,
protected-access,
abstract-method,
wrong-import-position,
catching-non-exception,
cell-var-from-loop,
locally-disabled,
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
abstract-method,
wrong-import-position,
catching-non-exception,
cell-var-from-loop,
locally-disabled,
raise-missing-from,
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
# 0.2.0 - 2018-07-010
# 0.13.5 - 2025-01-09
- Keep orientation metadata on jpeg and tiff files
- Improve cairo-related error/exceptions handling
- Improve the logging
- Improve the sandboxing
- Improve Python3.12 support
- Improve MSOffice documents handling
# 0.13.4 - 2023-08-02
- Add documentation about mat2 on OSX
- Make use of python3.7 constructs to simplify code
- Use moderner type annotations
- Harden get_meta in archive.py against variants of CVE-2022-35410
- Improve MSOffice document support
- Package the manpage on pypi
# 0.13.3 - 2023-02-23
- Fix a decorator argument
# 0.13.2 - 2023-01-28
- Fix a crash on some python versions
# 0.13.1 - 2023-01-07
- Improve xlsx support
- Remove the Nautilus extension
# 0.13.0 - 2022-07-06
- Fix an arbitrary file read (CVE-2022-35410)
- Add support for heic files
# 0.12.4 - 2022-04-30
- Fix possible errors/crashes when processing multiple files
via the command line interface
- Use a fixed PDF version for the output
- Improve compatibility with modern versions of rsvg
- Improve the robustness of the command line interface with
regard to control characters
# 0.12.3 - 2022-01-06
- Implement code for internationalization
- Keep individual files compression type in zip files
- Increase the robustness of mat2 against weird/corrupted files
- Fix the dolphin integration
- Add a fuzzer
# 0.12.2 - 2021-08-29
- Add support for aiff files
- Improve MS Office support
- Improve compatibility with newer/older version of mat2's dependencies
- Fix possible issues with the resolution of processed pdf
# 0.12.1 - 2021-03-19
- Improve epub support
- Improve MS Office support
# 0.12.0 - 2020-12-18
- Improve significantly MS Office formats support
- Fix some typos in the Nautilus extension
- Improve reliability of the mp3, pdf and svg parsers
- Improve compatibility with ffmpeg when sandboxing is used
- Improve the dolphin extension usability
- libmat2 now raises a ValueError on malformed files while trying to
find the right parser, instead of returning None
# 0.11.0 - 2020-03-29
- Improve significantly MS Office formats support
- Refactor how mat2 looks for executables
# 0.10.1 - 2020-02-09
- Improve the documentation and the manpage
- Improve the robustness of css, html, png, gdk-based, exiftool-based parsers
- Future-proof a bit the testsuite
- Handle tiff files with a .tif extension
- Improve the sandbox' usability
- Add support for wav files
# 0.10.0 - 2019-11-30
- Make mat2 work on Python3.8
- Minor improvement of ppt handling
- Minor improvement of odt handling
- Add an integration KDE's file manager: Dolphin
- mat2 now copies file permissions on the cleaned files
- Add a flag to disable sandboxing
- Tighten a bit the sandboxing
- Improve handling of MSOffice documents
- Add support for inplace cleaning
- Better handling of mutually-exclusive arguments in the command line
- Add support for svg
- Add support for ppm
- Cleaned zip files are compressed by default
- Minor performances improvement when dealing with images/video files
- Better handling of optional dependencies
# 0.9.0 - 2019-05-10
- Add tar/tar.gz/tar.bz2/tar.zx archives support
- Add support for xhtml files
- Improve handling of read-only files
- Improve a bit the command line's documentation
- Fix a confusing error message
- Add even more tests
- Usuals internal cleanups/refactorings
# 0.8.0 - 2019-02-28
- Add support for epub files
- Fix the setup.py file crashing on non-utf8 platforms
- Improve css support
- Improve html support
# 0.7.0 - 2019-02-17
- Add support for wmv files
- Add support for gif files
- Add support for html files
- Sandbox external processes via bubblewrap
- Simplify archive-based formats processing
- The Nautilus extension now plays nicer with other extensions
# 0.6.0 - 2018-11-10
- Add lightweight cleaning for jpeg
- Add support for zip files
- Add support for mp4 files
- Improve metadata extraction for archives
- Improve robustness against corrupted embedded files
- Fix a possible security issue on some terminals (control character
injection via --show)
- Various internal cleanup/improvements
# 0.5.0 - 2018-10-23
- Video (.avi files for now) support, via FFmpeg, optionally
- Lightweight cleaning for png and tiff files
- Processing files starting with a dash is now quicker
- Metadata are now displayed sorted
- Recursive metadata support for FLAC files
- Unsupported extensions aren't displayed in `./mat2 -l` anymore
- Improve the display when no metadata are found
- Update the logo according to the GNOME guidelines
- The testsuite is now runnable on the installed version of mat2
- Various internal cleanup/improvements
# 0.4.0 - 2018-10-03
- There is now a policy, for advanced users, to deal with unknown embedded fileformats
- Improve the documentation
- Various minor refactoring
- Improve how corrupted PNG are handled
- Dangerous/advanced cli's options no longer have short versions
- Significant improvements to office files anonymisation
- Archive members are sorted lexicographically
- XML attributes are sorted lexicographically too
- RSID are now stripped
- Dangling references in [Content_types].xml are now removed
- Significant improvements to office files support
- Anonimysed office files can now be opened by MS Office without warnings
- The CLI isn't threaded anymore, for it was causing issues
- Various misc typo fix
# 0.3.1 - 2018-09-01
- Document how to install mat2 for various distributions
- Fix various typos in the documentation/comments
- Add ArchLinux to the CI to ensure that mat2 is running on it
- Fix the handling of files with a name ending in `.JPG`
- Improve the detection of unsupported extensions in upper-case
- Streamline mat2's logging
# 0.3.0 - 2018-08-03
- Add a check for missing dependencies
- Add Nautilus extension
- Minors code simplifications
- Improve our linters' coverage
- Add a manpage
- Add folder/multiple files related tests
- Change the logo
# 0.2.0 - 2018-07-10
- Fix various crashes dues to malformed files
- Simplify various code-paths
- Remove superfluous debug message
- Remove the `--check` option that never was implemented anyway
- Add a `-c` option to check for MAT2's dependencies
- Add a `-c` option to check for mat2's dependencies
# 0.1.3 - 2018-07-06
- Improve MAT2 resilience against corrupted images
- Improve mat2 resilience against corrupted images
- Check that the minimal version of Poppler is available
- Simplify how MAT2 deals with office files
- Simplify how mat2 deals with office files
- Improve cleaning of office files
- Thumbnails are removed
- Revisions are removed
......@@ -23,8 +217,8 @@
- Rename some files to ease the packaging
- Add linters to the CI (mypy, bandit and pyflakes)
- Prevent exitftool-related parameters injections
- Improve MAT2's resilience against corrupted files
- Make MAT2 work on fedora, thanks to @atenart
- Improve mat2's resilience against corrupted files
- Make mat2 work on fedora, thanks to @atenart
- Tighten the threat model
- Simplify and improve how office files are handled
......
# Contributing to MAT2
# Contributing to mat2
The main repository for MAT2 is on [0xacab]( https://0xacab.org/jvoisin/mat2 ),
with a mirror on [gitlab.com]( https://gitlab.com/jvoisin/mat2 ).
The main repository for mat2 is on [0xacab]( https://0xacab.org/jvoisin/mat2 ),
but you can send patches to jvoisin by [email](https://dustri.org/) if you prefer.
Do feel free to pick up [an issue]( https://0xacab.org/jvoisin/mat2/issues )
and to send a pull-request. Please do check that everything is fine by running the
testsuite with `python3 -m unittest discover -v` before submitting one :)
and to send a pull-request.
Before sending the pull-request, please do check that everything is fine by
running the full test suite in GitLab. To do that, after forking mat2 in GitLab,
you need to go in Settings -> CI/CD -> Runner and there enable shared runners.
Mat2 also has unit tests (that are also run in the full test suite). You can run
them with `python3 -m unittest discover -v`.
If you're fixing a bug or adding a new feature, please add tests accordingly,
this will greatly improve the odds of your merge-request getting merged.
......@@ -16,7 +22,7 @@ If you're adding a new fileformat, please add tests for:
2. Cleaning metadata
3. Raising `ValueError` upon a corrupted file
Since MAT2 is written in Python3, please conform as much as possible to the
Since mat2 is written in Python3, please conform as much as possible to the
[pep8]( https://pep8.org/ ) style; except where it makes no sense of course.
# Doing a release
......@@ -24,9 +30,16 @@ Since MAT2 is written in Python3, please conform as much as possible to the
1. Update the [changelog](https://0xacab.org/jvoisin/mat2/blob/master/CHANGELOG.md)
2. Update the version in the [mat2](https://0xacab.org/jvoisin/mat2/blob/master/mat2) file
3. Update the version in the [setup.py](https://0xacab.org/jvoisin/mat2/blob/master/setup.py) file
4. Update the version and date in the [man page](https://0xacab.org/jvoisin/mat2/blob/master/doc/mat.1)
5. Commit the changelog, man page, mat2 and setup.py files
6. Create a tag with `git tag -s $VERSION`
7. Push the commit with `git push origin master`
8. Push the tag with `git push --tags`
9. Do the secret release dance
4. Update the version in the [pyproject.toml](https://0xacab.org/jvoisin/mat2/blob/master/yproject.toml) file
5. Update the version and date in the [man page](https://0xacab.org/jvoisin/mat2/blob/master/doc/mat2.1)
6. Commit the modified files
7. Create a tag with `git tag -s $VERSION`
8. Push the commit with `git push origin master`
9. Push the tag with `git push --tags`
10. Download the gitlab archive of the release
11. Diff it against the local copy
12. If there is no difference, sign the archive with `gpg --armor --detach-sign mat2-$VERSION.tar.xz`
13. Upload the signature on Gitlab's [tag page](https://0xacab.org/jvoisin/mat2/tags) and add the changelog there
14. Announce the release on the [mailing list](https://mailman.boum.org/listinfo/mat-dev)
15. Sign'n'upload the new version on pypi with `python3 setup.py sdist bdist_wheel` then `twine upload -s dist/*`
16. Do the secret release dance
# Python ecosystem
If you feel like running arbitrary code downloaded over the
internet (pypi doesn't support gpg signatures [anymore](https://github.com/pypa/python-packaging-user-guide/pull/466)),
mat2 is [available on pypi](https://pypi.org/project/mat2/), and can be
installed like this:
```
pip3 install mat2
```
# GNU/Linux
## Optional dependencies
When [bubblewrap](https://github.com/projectatomic/bubblewrap) is
installed, mat2 uses it to sandbox any external processes it invokes.
## Arch Linux
Thanks to [kpcyrd](https://archlinux.org/packages/?maintainer=kpcyrd), there is an package available on
[Arch linux's AUR](https://archlinux.org/packages/extra/any/mat2/).
## Debian
There is a package available in [Debian](https://packages.debian.org/search?keywords=mat2&searchon=names&section=all) and you can install mat2 with:
```
apt install mat2
```
## Fedora
Thanks to [atenart](https://ack.tf/), there is a package available on
[Fedora's copr]( https://copr.fedorainfracloud.org/coprs/atenart/mat2/ ).
First you need to enable mat2's copr:
```
dnf -y copr enable atenart/mat2
```
Then you can install mat2:
```
dnf -y install mat2
```
## Gentoo
mat2 is available in the [torbrowser overlay](https://github.com/MeisterP/torbrowser-overlay).
# OSX
## Homebrew
mat2 is [available on homebrew](https://formulae.brew.sh/formula/mat2):
```
brew install mat2
```
## MacPorts
mat2 is [available on MacPorts](https://ports.macports.org/port/mat2/):
```
port install mat2
```
```
_____ _____ _____ ___
| | _ |_ _|_ | Keep your data,
| | | | | | | | _| trash your meta!
|_|_|_|__|__| |_| |___|
| | | | |_| | | | | _| trash your meta!
|_|_|_|_| |_| |_| |___|
```
This software is currently in **beta**, please don't use it for anything
critical.
# Metadata and privacy
Metadata consist of information that characterizes data.
......@@ -20,20 +17,37 @@ Metadata within a file can tell a lot about you.
Cameras record data about when a picture was taken and what
camera was used. Office documents like PDF or Office automatically adds
author and company information to documents and spreadsheets.
Maybe you don't want to disclose those information on the web.
Maybe you don't want to disclose those information.
This is precisely the job of MAT2: getting rid, as much as possible, of
This is precisely the job of mat2: getting rid, as much as possible, of
metadata.
mat2 provides:
- a library called `libmat2`;
- a command line tool called `mat2`,
- a service menu for Dolphin, KDE's default file manager
If you prefer a regular graphical user interface, you might be interested in
[Metadata Cleaner](https://metadatacleaner.romainvigier.fr/), which is using
`mat2` under the hood.
# Requirements
- `python3-mutagen` for audio support
- `python3-gi-cairo` and `gir1.2-poppler-0.18` for PDF support
- `gir1.2-gdkpixbuf-2.0` for images support
- `gir1.2-rsvg-2.0` for svg support
- `FFmpeg`, optionally, for video support
- `libimage-exiftool-perl` for everything else
- `bubblewrap`, optionally, for sandboxing
Please note that mat2 requires at least Python3.5.
Please note that MAT2 requires at least Python3.5, meaning that it
doesn't run on [Debian Jessie](https://packages.debian.org/jessie/python3),
# Requirements setup on macOS (OS X) using [Homebrew](https://brew.sh/)
```bash
brew install exiftool cairo pygobject3 poppler gdk-pixbuf librsvg ffmpeg
```
# Running the test suite
......@@ -41,35 +55,85 @@ doesn't run on [Debian Jessie](https://packages.debian.org/jessie/python3),
$ python3 -m unittest discover -v
```
# How to use MAT2
And if you want to see the coverage:
```bash
usage: mat2 [-h] [-v] [-l] [-s | -L] [files [files ...]]
$ python3-coverage run --branch -m unittest discover -s tests/
$ python3-coverage report --include -m --include /libmat2/*'
```
# How to use mat2
```
usage: mat2 [-h] [-V] [--unknown-members policy] [--inplace] [--no-sandbox]
[-v] [-l] [--check-dependencies] [-L | -s]
[files [files ...]]
Metadata anonymisation toolkit 2
positional arguments:
files
files the files to process
optional arguments:
-h, --help show this help message and exit
-v, --version show program's version number and exit
-l, --list list all supported fileformats
-s, --show list all the harmful metadata of a file without removing
them
-L, --lightweight remove SOME metadata
-h, --help show this help message and exit
-V, --verbose show more verbose status information
--unknown-members policy
how to handle unknown members of archive-style files
(policy should be one of: abort, omit, keep) [Default:
abort]
--inplace clean in place, without backup
--no-sandbox Disable bubblewrap's sandboxing
-v, --version show program's version number and exit
-l, --list list all supported fileformats
--check-dependencies check if mat2 has all the dependencies it needs
-L, --lightweight remove SOME metadata
-s, --show list harmful metadata detectable by mat2 without
removing them
```
Note that mat2 **will not** clean files in-place, but will produce, for
example, with a file named "myfile.png" a cleaned version named
"myfile.cleaned.png".
## Web interface
It's possible to run mat2 as a web service, via
[mat2-web](https://0xacab.org/jvoisin/mat2-web).
If you're using WordPress, you might be interested in [wp-mat](https://git.autistici.org/noblogs/wp-mat)
and [wp-mat-server](https://git.autistici.org/noblogs/wp-mat-server).
## Desktop GUI
For GNU/Linux desktops, it's possible to use the
[Metadata Cleaner](https://gitlab.com/rmnvgr/metadata-cleaner) GTK application.
# Supported formats
The following formats are supported: avi, bmp, css, epub/ncx, flac, gif, jpeg,
m4a/mp2/mp3/…, mp4, odc/odf/odg/odi/odp/ods/odt/…, off/opus/oga/spx/…, pdf,
png, ppm, pptx/xlsx/docx/…, svg/svgz/…, tar/tar.gz/tar.bz2/tar.xz/…, tiff,
torrent, wav, wmv, zip, …
# Notes about detecting metadata
While MAT2 is doing its very best to display metadata when the `--show` flag is
passed, it doesn't mean that a file is clean from any metadata if MAT2 doesn't
While mat2 is doing its very best to display metadata when the `--show` flag is
passed, it doesn't mean that a file is clean from any metadata if mat2 doesn't
show any. There is no reliable way to detect every single possible metadata for
complex file formats.
This is why you shouldn't rely on metadata's presence to decide if your file must
be cleaned or not.
# Notes about the lightweight mode
By default, mat2 might alter a bit the data of your files, in order to remove
as much metadata as possible. For example, texts in PDF might not be selectable anymore,
compressed images might get compressed again, …
Since some users might be willing to trade some metadata's presence in exchange
of the guarantee that mat2 won't modify the data of their files, there is the
`-L` flag that precisely does that.
# Related software
- The first iteration of [MAT](https://mat.boum.org)
......@@ -78,14 +142,23 @@ be cleaned or not.
tries to deal with *printer dots* too.
- [pdfparanoia](https://github.com/kanzure/pdfparanoia), that removes
watermarks from PDF.
- [Scrambled Exif](https://f-droid.org/packages/com.jarsilio.android.scrambledeggsif/),
an open-source Android application to remove metadata from pictures.
- [Dangerzone](https://dangerzone.rocks/), designed to sanitize harmful documents
into harmless ones.
# Contact
If possible, use the [issues system](https://0xacab.org/jvoisin/mat2/issues).
If you think that a more private contact is needed (eg. for reporting security issues),
you can email Julien (jvoisin) Voisin at `julien.voisin+mat@dustri.org`,
If possible, use the [issues system](https://0xacab.org/jvoisin/mat2/issues)
or the [mailing list](https://www.autistici.org/mailman/listinfo/mat-dev)
Should a more private contact be needed (eg. for reporting security issues),
you can email Julien (jvoisin) Voisin at `julien.voisin+mat2@dustri.org`,
using the gpg key `9FCDEE9E1A381F311EA62A7404D041E8171901CC`.
# Donations
If you want to donate some money, please give it to [Tails]( https://tails.boum.org/donate/?r=contribute ).
# License
This program is free software: you can redistribute it and/or modify
......@@ -101,15 +174,20 @@ GNU General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Copyright 2018 Julien (jvoisin) Voisin <julien.voisin+mat2@dustri.org>
Copyright 2018 Julien (jvoisin) Voisin <julien.voisin+mat2@dustri.org>
Copyright 2016 Marie-Rose for mat2's logo
The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3,
and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx
The `narrated_powerpoint_presentation.pptx` file is in the public domain.
# Thanks
MAT2 wouldn't exist without:
mat2 wouldn't exist without:
- the [Google Summer of Code](https://summerofcode.withgoogle.com/);
- the fine people from [Tails]( https://tails.boum.org);
- friends
Many thanks to them!
data/mat2.png

3.13 KiB | W: 0px | H: 0px

data/mat2.png

28.2 KiB | W: 0px | H: 0px

data/mat2.png
data/mat2.png
data/mat2.png
data/mat2.png
  • 2-up
  • Swipe
  • Onion skin
This diff is collapsed.
# Exiftool
mat2 is in fact using exiftool to extract metadata from files,
but not to remove them. The previous iteration of mat2, MAT,
was using exiftool to remove metadata, which lead to several cases where
they weren't correctly removed, if at all.
For example, [Exiftool's documentation](https://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/PDF.html)
states the following with regard to PDF:
> All metadata edits are reversible. While this would normally be considered an
advantage, it is a potential security problem because old information is never
actually deleted from the file.
To remove metadata, mat2 usually re-render the file completely, eliminating
all possible original metadata. See the `implementation_notes.md` file for
details.
# jpegoptim, optipng, …
While designed to reduce as much as possible the size of pictures,
those software can be used to remove metadata. They usually have very good
support for a single picture format, and can be used in place of mat2 for them.
# PDF Redact Tools
[PDF Redact Tools](https://github.com/firstlookmedia/pdf-redact-tools) is
a software developed by the people from [First Look
Media](https://firstlook.media/), the entity behind, amongst other things,
[The Intercept](https://theintercept.com/).
The tool uses roughly the same approach than mat2 to deal with PDF,
which is unfortunately the only fileformat that it does support.
It's interesting to note that it has counter-measures against
[yellow dots](https://en.wikipedia.org/wiki/Machine_Identification_Code),
a capacity that mat2 [doesn't possess yet](https://0xacab.org/jvoisin/mat2/issues/43).
# Exiv2
[Exiv2](https://www.exiv2.org/) was considered for mat2,
but it currently [misses a lot of metadata](https://0xacab.org/jvoisin/mat2/issues/85)
# Others non open source software/online service
There are a lot of closed-source software and online service claiming to remove
metadata from your files, but since there is no way to actually verify that
they're effectively removing them, let alone adding unique markers, they
shouldn't be used.
......@@ -4,41 +4,57 @@ Implementation notes
Lightweight cleaning mode
-------------------------
Due to *popular* request, MAT2 is providing a *lightweight* cleaning mode,
Due to *popular* request, mat2 is providing a *lightweight* cleaning mode,
that only cleans the superficial metadata of your file, but not
the ones that might be in **embeded** resources. Like for example,
the ones that might be in **embedded** resources. Like for example,
images in a PDF or an office document.
Revisions handling
------------------
Revisions are handled according to the principle of least astonishment: they are entirely removed.
Revisions are handled according to the principle of least astonishment: they
are entirely removed.
- Either the users aren't aware of the revisions, are thus they should be deleted. For example journalists that are editing a document to erase mentions sources mentions.
- Either the users aren't aware of the revisions, are thus they should be
deleted. For example journalists that are editing a document to erase
mentions sources mentions.
- Or they are aware of it, and will likely not expect MAT2 to be able to keep the revisions, that are basically traces about how, when and who edited the document.
- Or they are aware of it, and will likely not expect mat2 to be able to keep
the revisions, that are basically traces about how, when and who edited the
document.
Race conditions
---------------
MAT2 does its very best to avoid crashing at runtime. This is why it's checking
if the file is valid __at parser creation__. MAT2 doesn't take any measure to
mat2 does its very best to avoid crashing at runtime. This is why it's checking
if the file is valid __at parser creation__. mat2 doesn't take any measure to
ensure that the file is not changed between the time the parser is
instantiated, and the call to clean or show the metadata.
Symlink attacks
---------------
MAT2 output predictable filenames (like yourfile.jpg.cleaned).
mat2 output predictable filenames (like yourfile.jpg.cleaned).
This may lead to symlink attack. Please check if you OS prevent
against them
Archives handling
-----------------
MAT2 doesn't support archives yet, because we haven't found an usable way to ask the user
what to do when a non-supported files are encountered.
By default, when cleaning a non-support file format in an archive,
mat2 will abort with a detailed error message.
While strongly discouraged, it's possible to override this behaviour to force
the exclusion, or inclusion of unknown files into the cleaned archive.
While Python's [zipfile](https://docs.python.org/3/library/zipfile.html) module
provides *safe* way to extract members of a zip archive, the
[tarfile](https://docs.python.org/3/library/tarfile.html) one doesn't,
meaning that it's up to mat2 to implement safety checks. Currently,
it defends against path-traversal, both relative and absolute,
symlink-related attacks, setuid/setgid attacks, duplicate members, block and
char devices, … but there might still be dragons lurking there.
PDF handling
------------
......@@ -49,10 +65,10 @@ didn't remove any *deep metadata*, like the ones in embedded pictures. This was
on of the reason MAT was abandoned: the absence of satisfying solution to
handle PDF. But apparently, people are ok with [pdf redact
tools](https://github.com/firstlookmedia/pdf-redact-tools), that simply
transform the PDF into images. So this is what's MAT2 is doing too.
transform the PDF into images. So this is what's mat2 is doing too.
Of course, it would be possible to detect images in PDf file, and process them
with MAT2, but since a PDF can contain a lot of things, like images, videos,
with mat2, but since a PDF can contain a lot of things, like images, videos,
javascript, pdf, blobs, … this is the easiest and safest way to clean them.
Images handling
......@@ -61,3 +77,11 @@ Images handling
When possible, images are handled like PDF: rendered on a surface, then saved
to the filesystem. This ensures that every metadata is removed.
XML attacks
-----------
Since our threat model conveniently excludes files crafted to specifically
bypass mat2, fileformats containing harmful XML are out of our scope.
But since mat2 is using [etree](https://docs.python.org/3/library/xml.html#xml-vulnerabilities)
to process XML, it's "only" vulnerable to DoS, and not memory corruption:
odds are that the user will notice that the cleaning didn't succeed.
.TH MAT2 "1" "July 2018" "MAT2 0.2.0" "User Commands"
.TH mat2 "1" "January 2025" "mat2 0.13.5" "User Commands"
.SH NAME
mat2 \- the metadata anonymisation toolkit 2
.SH SYNOPSIS
mat2 [\-h] [\-v] [\-l] [\-c] [\-s | \-L]\fR [files [files ...]]
\fBmat2\fR [\-h] [\-v] [\-l] [\-V] [-s | -L] [\fIfiles\fR [\fIfiles ...\fR]]
.SH DESCRIPTION
.B mat2
removes metadata from various fileformats. It supports a wide variety of file
removes metadata from various fileformats. It supports a wide variety of file
formats, audio, office, images, …
Careful, mat2 does not clean files in-place, instead, it will produce a file with the word
"cleaned" between the filename and its extension, for example "filename.cleaned.png"
for a file named "filename.png".
.SH OPTIONS
.SS "positional arguments:"
.TP
......@@ -27,15 +31,26 @@ show program's version number and exit
\fB\-l\fR, \fB\-\-list\fR
list all supported fileformats
.TP
\fB\-c\fR, \fB\-\-check\-dependencies\fR
check if MAT2 has all the dependencies it needs
\fB\-\-check\-dependencies\fR
check if mat2 has all the dependencies it needs
.TP
\fB\-V\fR, \fB\-\-verbose\fR
show more verbose status information
.TP
\fB\-\-unknown-members\fR \fIpolicy\fR
how to handle unknown members of archive-style files (policy should be one of: abort, omit, keep)
.TP
\fB\-s\fR, \fB\-\-show\fR
list harmful metadata detectable by MAT2 without
removing them
list harmful metadata detectable by mat2 without removing them
.TP
\fB\-L\fR, \fB\-\-lightweight\fR
remove SOME metadata
.TP
\fB\--no-sandbox\fR
disable bubblewrap's sandboxing
.TP
\fB\--inplace\fR
clean in place, without backup
.SH EXAMPLES
To remove all the metadata from a PDF file:
......@@ -47,6 +62,24 @@ mat2 ./myfile.pdf
.fi
.PP
.SH NOTES ABOUT METADATA
While mat2 is doing its very best to display metadata when the --show flag is
passed, it doesn't mean that a file is clean from any metadata if mat2 doesn't
show any. There is no reliable way to detect every single possible metadata for
complex file formats.
.PP
This is why you shouldn't rely on metadata's presence to decide if your file must
be cleaned or not.
.PP
Moreover, mat2 goes to great lengths to make sure that as much metadata as
possible are removed. This might sometimes result in a loss of quality of the
processed files. For example, textual based pdf file converted into image based
one means that it'll be no longer possible to select text in them. If you're
experiencing this, you might want to give the lightweight cleaning mode a try,
but keep in mind by doing so, some metadata \fBwon't be cleaned\fR.
.SH BUGS
While mat2 does its very best to remove every single metadata,
......
......@@ -3,7 +3,7 @@ Threat Model
The Metadata Anonymisation Toolkit 2 adversary has a number
of goals, capabilities, and counter-attack types that can be
used to guide us towards a set of requirements for the MAT2.
used to guide us towards a set of requirements for the mat2.
This is an overhaul of MAT's (the first iteration of the software) one.
......@@ -53,7 +53,7 @@ Adversary
user. This is the strongest position for the adversary to
have. In this case, the adversary is capable of inserting
arbitrary, custom watermarks specifically for tracking
the user. In general, MAT2 cannot defend against this
the user. In general, mat2 cannot defend against this
adversary, but we list it for completeness' sake.
- The adversary created the document for a group of users.
......@@ -65,7 +65,7 @@ Adversary
- The adversary did not create the document, the weakest
position for the adversary to have. The file format is
(most of the time) standard, nothing custom is added:
MAT2 must be able to remove all metadata from the file.
mat2 must be able to remove all metadata from the file.
Requirements
......@@ -73,28 +73,28 @@ Requirements
* Processing
- MAT2 *should* avoid interactions with information.
- mat2 *should* avoid interactions with information.
Its goal is to remove metadata, and the user is solely
responsible for the information of the file.
- MAT2 *must* warn when encountering an unknown
format. For example, in a zipfile, if MAT encounters an
- mat2 *must* warn when encountering an unknown
format. For example, in a zipfile, if mat2 encounters an
unknown format, it should warn the user, and ask if the
file should be added to the anonymised archive that is
produced.
- MAT2 *must* not add metadata, since its purpose is to
- mat2 *must* not add metadata, since its purpose is to
anonymise files: every added items of metadata decreases
anonymity.
- MAT2 *should* handle unknown/hidden metadata fields,
- mat2 *should* handle unknown/hidden metadata fields,
like proprietary extensions of open formats.
- MAT2 *must not* fail silently. Upon failure,
MAT2 *must not* modify the file in any way.
- mat2 *must not* fail silently. Upon failure,
mat2 *must not* modify the file in any way.
- MAT2 *might* leak the fact that MAT2 was used on the file,
- mat2 *might* leak the fact that mat2 was used on the file,
since it might be uncommon for some file formats to come
without any kind of metadata, an adversary might suspect that
the user used MAT2 on certain files.
the user used mat2 on certain files.
Dolphin integration
===================
Thanks to [Miguel Marco](https://riemann.unizar.es/~mmarco/), here is an neat
integration for [Dolphin](https://kde.org/applications/system/org.kde.dolphin),
the KDE file manager:
1. Add the `mat2.desktop` file either in
- `/usr/share/kservices5/ServiceMenus/` to install it globally
- `~/.local/share/kservices5/ServiceMenus/` for a specific user
2. Run `kbuildsycoca5` to update the corresponding database
3. Enjoy your new contextual menu to remove metadata from your files!
[Desktop Entry]
X-KDE-ServiceTypes=KonqPopupMenu/Plugin
MimeType=application/pdf;application/vnd.oasis.opendocument.chart;application/vnd.oasis.opendocument.formula;application/vnd.oasis.opendocument.graphics;application/vnd.oasis.opendocument.image;application/vnd.oasis.opendocument.presentation;application/vnd.oasis.opendocument.spreadsheet;application/vnd.oasis.opendocument.text;application/vnd.openxmlformats-officedocument.presentationml.presentation;application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;application/vnd.openxmlformats-officedocument.wordprocessingml.document;application/x-bittorrent;application/zip;audio/flac;audio/mpeg;audio/ogg;audio/x-flac;image/jpeg;image/png;image/tiff;image/x-ms-bmp;text/plain;video/mp4;video/x-msvideo;
Actions=cleanMetadata;
Type=Service
[Desktop Action cleanMetadata]
Name=Clean metadata
Name[de]=Metadaten löschen
Name[es]=Limpiar metadatos
Icon=/usr/share/icons/hicolor/scalable/apps/mat2.svg
Exec=kdialog --yesno "$( mat2 -s %F )" --title "Clean Metadata?" && mat2 %U
Exec[de]=kdialog --yesno "$( mat2 -s %F )" --title "Metadaten löschen?" && mat2 %U
#!/bin/env python3
#!/usr/bin/env python3
import os
import collections
import enum
import importlib
from typing import Dict
# make pyflakes happy
assert Dict
from . import exiftool, video
# A set of extension that aren't supported, despite matching a supported mimetype
UNSUPPORTED_EXTENSIONS = {
......@@ -27,27 +25,72 @@ UNSUPPORTED_EXTENSIONS = {
}
DEPENDENCIES = {
'cairo': 'Cairo',
'gi': 'PyGobject',
'gi.repository.GdkPixbuf': 'GdkPixbuf from PyGobject',
'gi.repository.Poppler': 'Poppler from PyGobject',
'gi.repository.GLib': 'GLib from PyGobject',
'mutagen': 'Mutagen',
}
'Cairo': {
'module': 'cairo',
'required': True,
},
'PyGobject': {
'module': 'gi',
'required': True,
},
'GdkPixbuf from PyGobject': {
'module': 'gi.repository.GdkPixbuf',
'required': True,
},
'Poppler from PyGobject': {
'module': 'gi.repository.Poppler',
'required': True,
},
'GLib from PyGobject': {
'module': 'gi.repository.GLib',
'required': True,
},
'Mutagen': {
'module': 'mutagen',
'required': True,
},
}
CMD_DEPENDENCIES = {
'Exiftool': {
'cmd': exiftool._get_exiftool_path,
'required': False,
},
'Ffmpeg': {
'cmd': video._get_ffmpeg_path,
'required': False,
},
}
def check_dependencies() -> dict:
ret = collections.defaultdict(bool) # type: Dict[str, bool]
exiftool = '/usr/bin/exiftool'
ret['Exiftool'] = False
if os.path.isfile(exiftool) and os.access(exiftool, os.X_OK): # pragma: no cover
ret['Exiftool'] = True
def check_dependencies() -> Dict[str, Dict[str, bool]]:
ret: Dict[str, Dict] = dict()
for key, value in DEPENDENCIES.items():
ret[value] = True
ret[key] = {
'found': True,
'required': value['required'],
}
try:
importlib.import_module(key)
importlib.import_module(value['module']) # type: ignore
except ImportError: # pragma: no cover
ret[value] = False # pragma: no cover
ret[key]['found'] = False
for k, v in CMD_DEPENDENCIES.items():
ret[k] = {
'found': True,
'required': v['required'],
}
try:
v['cmd']() # type: ignore
except RuntimeError: # pragma: no cover
ret[k]['found'] = False
return ret
@enum.unique
class UnknownMemberPolicy(enum.Enum):
ABORT = 'abort'
OMIT = 'omit'
KEEP = 'keep'
import abc
import os
from typing import Set, Dict
assert Set # make pyflakes happy
import re
from typing import Union, Set, Dict
class AbstractParser(abc.ABC):
""" This is the base classe of every parser.
It might yield `ValueError` on instanciation on invalid files.
""" This is the base class of every parser.
It might yield `ValueError` on instantiation on invalid files,
and `RuntimeError` when something went wrong in `remove_all`.
"""
meta_list = set() # type: Set[str]
mimetypes = set() # type: Set[str]
meta_list: Set[str] = set()
mimetypes: Set[str] = set()
def __init__(self, filename: str) -> None:
"""
:raises ValueError: Raised upon an invalid file
"""
if re.search('^[a-z0-9./]', filename) is None:
# Some parsers are calling external binaries,
# this prevents shell command injections
filename = os.path.join('.', filename)
self.filename = filename
fname, extension = os.path.splitext(filename)
# Special case for tar.gz, tar.bz2, … files
if fname.endswith('.tar') and len(fname) > 4:
fname, extension = fname[:-4], '.tar' + extension
self.output_filename = fname + '.cleaned' + extension
self.lightweight_cleaning = False
self.sandbox = True
@abc.abstractmethod
def get_meta(self) -> Dict[str, str]:
pass # pragma: no cover
def get_meta(self) -> Dict[str, Union[str, Dict]]:
"""Return all the metadata of the current file
:raises RuntimeError: Raised if the cleaning process went wrong.
"""
@abc.abstractmethod
def remove_all(self) -> bool:
pass # pragma: no cover
"""
Remove all the metadata of the current file
def remove_all_lightweight(self) -> bool:
""" This method removes _SOME_ metadata.
It might be useful to implement it for fileformats that do
not support non-destructive cleaning.
:raises RuntimeError: Raised if the cleaning process went wrong.
"""
return self.remove_all()
This diff is collapsed.
import mimetypes
import os
import shutil
import tempfile
from typing import Union, Dict
import mutagen
from . import abstract
from . import abstract, parser_factory, video
class MutagenParser(abstract.AbstractParser):
def __init__(self, filename):
super().__init__(filename)
try:
mutagen.File(self.filename)
if mutagen.File(self.filename) is None:
raise ValueError
except mutagen.MutagenError:
raise ValueError
def get_meta(self):
def get_meta(self) -> Dict[str, Union[str, Dict]]:
f = mutagen.File(self.filename)
if f.tags:
return {k:', '.join(v) for k, v in f.tags.items()}
return {k: ', '.join(map(str, v)) for k, v in f.tags.items()}
return {}
def remove_all(self):
def remove_all(self) -> bool:
shutil.copy(self.filename, self.output_filename)
f = mutagen.File(self.output_filename)
f.delete()
f.save()
try:
f.delete()
f.save()
except mutagen.MutagenError:
raise ValueError
return True
class MP3Parser(MutagenParser):
mimetypes = {'audio/mpeg', }
def get_meta(self):
metadata = {}
def get_meta(self) -> Dict[str, Union[str, Dict]]:
metadata: Dict[str, Union[str, Dict]] = dict()
meta = mutagen.File(self.filename).tags
if not meta:
return metadata
for key in meta:
if isinstance(key, tuple):
metadata[key[0]] = key[1]
continue
if not hasattr(meta[key], 'text'): # pragma: no cover
continue
metadata[key.rstrip(' \t\r\n\0')] = ', '.join(map(str, meta[key].text))
return metadata
......@@ -44,3 +59,56 @@ class OGGParser(MutagenParser):
class FLACParser(MutagenParser):
mimetypes = {'audio/flac', 'audio/x-flac'}
def remove_all(self) -> bool:
shutil.copy(self.filename, self.output_filename)
f = mutagen.File(self.output_filename)
f.clear_pictures()
f.delete()
f.save(deleteid3=True)
return True
def get_meta(self) -> Dict[str, Union[str, Dict]]:
meta = super().get_meta()
for num, picture in enumerate(mutagen.File(self.filename).pictures):
name = picture.desc if picture.desc else 'Cover %d' % num
extension = mimetypes.guess_extension(picture.mime)
if extension is None: # pragma: no cover
meta[name] = 'harmful data'
continue
_, fname = tempfile.mkstemp()
fname = fname + extension
with open(fname, 'wb') as f:
f.write(picture.data)
p, _ = parser_factory.get_parser(fname) # type: ignore
if p is None:
raise ValueError
p.sandbox = self.sandbox
# Mypy chokes on ternaries :/
meta[name] = p.get_meta() if p else 'harmful data' # type: ignore
os.remove(fname)
return meta
class WAVParser(video.AbstractFFmpegParser):
mimetypes = {'audio/x-wav', }
meta_allowlist = {'AvgBytesPerSec', 'BitsPerSample', 'Directory',
'Duration', 'Encoding', 'ExifToolVersion',
'FileAccessDate', 'FileInodeChangeDate',
'FileModifyDate', 'FileName', 'FilePermissions',
'FileSize', 'FileType', 'FileTypeExtension',
'MIMEType', 'NumChannels', 'SampleRate', 'SourceFile',
}
class AIFFParser(video.AbstractFFmpegParser):
mimetypes = {'audio/aiff', 'audio/x-aiff'}
meta_allowlist = {'AvgBytesPerSec', 'BitsPerSample', 'Directory',
'Duration', 'Encoding', 'ExifToolVersion',
'FileAccessDate', 'FileInodeChangeDate',
'FileModifyDate', 'FileName', 'FilePermissions',
'FileSize', 'FileType', 'FileTypeExtension',
'MIMEType', 'NumChannels', 'SampleRate', 'SourceFile',
'NumSampleFrames', 'SampleSize',
}