Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • tguinot/mat2
  • jvoisin/mat2
  • dachary/mat2
  • mejo-/mat2
  • LogicalDash/mat2
  • dkg/mat2
  • christian/mat2
  • Selflike323/mat2
  • fz/mat2
  • iwwmidatlanticgdc/mat2
  • Gu1nn3zz/mat2
  • smagnin/mat2
  • flashcode/mat2
  • MANCASTILLEJA/mat2
  • jboursier/mat2
  • tails/mat2
  • matiargs/mat2
  • Brolf/mat2
  • madaidan/mat2
  • Delmer84/mat2
  • yuebyzua/mat2
  • yyyyyyyan/mat2
  • rmnvgr/mat2
  • Marxism-Leninism/mat2
  • GNUtoo/mat2
  • allexj/mat2
  • b068931cc450442b63f5b3d276ea4297/mat2
  • chenrui/mat2
  • nosec13346/mat2
  • anelki/mat2
30 results
Show changes
Commits on Source (549)
.*
*.pyc *.pyc
.coverage
.eggs
.mypy_cache/
build
dist
mat2.egg-info
tags
image: debian variables:
CONTAINER_REGISTRY: $CI_REGISTRY/georg/mat2-ci-images
GIT_DEPTH: "5"
GIT_STRATEGY: clone
test: stages:
- linting
- test
.prepare_env: &prepare_env
before_script: # This is needed to not run the testsuite as root
- useradd --home-dir ${CI_PROJECT_DIR} mat2
- chown -R mat2 .
linting:ruff:
image: $CONTAINER_REGISTRY:linting
stage: linting
script:
- apt update
- apt install -qqy --no-install-recommends python3-venv
- python3 -m venv venv
- source venv/bin/activate
- pip3 install ruff
- ruff check .
linting:mypy:
image: $CONTAINER_REGISTRY:linting
stage: linting
script:
- mypy --ignore-missing-imports mat2 libmat2/*.py
tests:archlinux:
image: $CONTAINER_REGISTRY:archlinux
stage: test
script:
- python3 -m unittest discover -v
tests:debian:
image: $CONTAINER_REGISTRY:debian
stage: test
<<: *prepare_env
script:
- apt-get -qqy purge bubblewrap
- su - mat2 -c "python3-coverage run --branch -m unittest discover -s tests/"
- su - mat2 -c "python3-coverage report --fail-under=95 -m --include 'libmat2/*'"
tests:debian_with_bubblewrap:
image: $CONTAINER_REGISTRY:debian
stage: test
allow_failure: true
<<: *prepare_env
script:
- apt-get -qqy install bubblewrap
- python3 -m unittest discover -v
tests:fedora:
image: $CONTAINER_REGISTRY:fedora
stage: test
script:
- python3 -m unittest discover -v
tests:gentoo:
image: $CONTAINER_REGISTRY:gentoo
stage: test
<<: *prepare_env
script:
- su - mat2 -c "python3 -m unittest discover -v"
tests:python3.7:
image: $CONTAINER_REGISTRY:python3.7
stage: test
script:
- python3 -m unittest discover -v
tests:python3.8:
image: $CONTAINER_REGISTRY:python3.8
stage: test
script:
- python3 -m unittest discover -v
tests:python3.9:
image: $CONTAINER_REGISTRY:python3.9
stage: test
script:
- python3 -m unittest discover -v
tests:python3.10:
image: $CONTAINER_REGISTRY:python3.10
stage: test
script:
- python3 -m unittest discover -v
tests:python3.11:
image: $CONTAINER_REGISTRY:python3.11
stage: test
script:
- python3 -m unittest discover -v
tests:python3.12:
image: $CONTAINER_REGISTRY:python3.12
stage: test
script:
- python3 -m unittest discover -v
tests:python3.13:
image: $CONTAINER_REGISTRY:python3.13
stage: test
script:
- python3 -m unittest discover -v
tests:python3.14:
image: $CONTAINER_REGISTRY:python3.14
stage: test
script: script:
- apt-get -qqy update - python3 -m unittest discover -v
- apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage
- python3-coverage run -m unittest discover -s tests/
- python3-coverage report -m --include 'src/*'
Julien (jvoisin) Voisin <julien.voisin+mat2@dustri.org> totallylegit <totallylegit@dustri.org>
Julien (jvoisin) Voisin <julien.voisin+mat2@dustri.org> jvoisin <julien.voisin@dustri.org>
Julien (jvoisin) Voisin <julien.voisin+mat2@dustri.org> jvoisin <jvoisin@riseup.net>
Daniel Kahn Gillmor <dkg@fifthhorseman.net> dkg <dkg@fifthhorseman.net>
[FORMAT]
good-names=e,f,i,x,s
max-locals=20
[MESSAGES CONTROL]
disable=
fixme,
invalid-name,
duplicate-code,
missing-docstring,
protected-access,
abstract-method,
wrong-import-position,
catching-non-exception,
cell-var-from-loop,
locally-disabled,
raise-missing-from,
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
# 0.13.5 - 2025-01-09
- Keep orientation metadata on jpeg and tiff files
- Improve cairo-related error/exceptions handling
- Improve the logging
- Improve the sandboxing
- Improve Python3.12 support
- Improve MSOffice documents handling
# 0.13.4 - 2023-08-02
- Add documentation about mat2 on OSX
- Make use of python3.7 constructs to simplify code
- Use moderner type annotations
- Harden get_meta in archive.py against variants of CVE-2022-35410
- Improve MSOffice document support
- Package the manpage on pypi
# 0.13.3 - 2023-02-23
- Fix a decorator argument
# 0.13.2 - 2023-01-28
- Fix a crash on some python versions
# 0.13.1 - 2023-01-07
- Improve xlsx support
- Remove the Nautilus extension
# 0.13.0 - 2022-07-06
- Fix an arbitrary file read (CVE-2022-35410)
- Add support for heic files
# 0.12.4 - 2022-04-30
- Fix possible errors/crashes when processing multiple files
via the command line interface
- Use a fixed PDF version for the output
- Improve compatibility with modern versions of rsvg
- Improve the robustness of the command line interface with
regard to control characters
# 0.12.3 - 2022-01-06
- Implement code for internationalization
- Keep individual files compression type in zip files
- Increase the robustness of mat2 against weird/corrupted files
- Fix the dolphin integration
- Add a fuzzer
# 0.12.2 - 2021-08-29
- Add support for aiff files
- Improve MS Office support
- Improve compatibility with newer/older version of mat2's dependencies
- Fix possible issues with the resolution of processed pdf
# 0.12.1 - 2021-03-19
- Improve epub support
- Improve MS Office support
# 0.12.0 - 2020-12-18
- Improve significantly MS Office formats support
- Fix some typos in the Nautilus extension
- Improve reliability of the mp3, pdf and svg parsers
- Improve compatibility with ffmpeg when sandboxing is used
- Improve the dolphin extension usability
- libmat2 now raises a ValueError on malformed files while trying to
find the right parser, instead of returning None
# 0.11.0 - 2020-03-29
- Improve significantly MS Office formats support
- Refactor how mat2 looks for executables
# 0.10.1 - 2020-02-09
- Improve the documentation and the manpage
- Improve the robustness of css, html, png, gdk-based, exiftool-based parsers
- Future-proof a bit the testsuite
- Handle tiff files with a .tif extension
- Improve the sandbox' usability
- Add support for wav files
# 0.10.0 - 2019-11-30
- Make mat2 work on Python3.8
- Minor improvement of ppt handling
- Minor improvement of odt handling
- Add an integration KDE's file manager: Dolphin
- mat2 now copies file permissions on the cleaned files
- Add a flag to disable sandboxing
- Tighten a bit the sandboxing
- Improve handling of MSOffice documents
- Add support for inplace cleaning
- Better handling of mutually-exclusive arguments in the command line
- Add support for svg
- Add support for ppm
- Cleaned zip files are compressed by default
- Minor performances improvement when dealing with images/video files
- Better handling of optional dependencies
# 0.9.0 - 2019-05-10
- Add tar/tar.gz/tar.bz2/tar.zx archives support
- Add support for xhtml files
- Improve handling of read-only files
- Improve a bit the command line's documentation
- Fix a confusing error message
- Add even more tests
- Usuals internal cleanups/refactorings
# 0.8.0 - 2019-02-28
- Add support for epub files
- Fix the setup.py file crashing on non-utf8 platforms
- Improve css support
- Improve html support
# 0.7.0 - 2019-02-17
- Add support for wmv files
- Add support for gif files
- Add support for html files
- Sandbox external processes via bubblewrap
- Simplify archive-based formats processing
- The Nautilus extension now plays nicer with other extensions
# 0.6.0 - 2018-11-10
- Add lightweight cleaning for jpeg
- Add support for zip files
- Add support for mp4 files
- Improve metadata extraction for archives
- Improve robustness against corrupted embedded files
- Fix a possible security issue on some terminals (control character
injection via --show)
- Various internal cleanup/improvements
# 0.5.0 - 2018-10-23
- Video (.avi files for now) support, via FFmpeg, optionally
- Lightweight cleaning for png and tiff files
- Processing files starting with a dash is now quicker
- Metadata are now displayed sorted
- Recursive metadata support for FLAC files
- Unsupported extensions aren't displayed in `./mat2 -l` anymore
- Improve the display when no metadata are found
- Update the logo according to the GNOME guidelines
- The testsuite is now runnable on the installed version of mat2
- Various internal cleanup/improvements
# 0.4.0 - 2018-10-03
- There is now a policy, for advanced users, to deal with unknown embedded fileformats
- Improve the documentation
- Various minor refactoring
- Improve how corrupted PNG are handled
- Dangerous/advanced cli's options no longer have short versions
- Significant improvements to office files anonymisation
- Archive members are sorted lexicographically
- XML attributes are sorted lexicographically too
- RSID are now stripped
- Dangling references in [Content_types].xml are now removed
- Significant improvements to office files support
- Anonimysed office files can now be opened by MS Office without warnings
- The CLI isn't threaded anymore, for it was causing issues
- Various misc typo fix
# 0.3.1 - 2018-09-01
- Document how to install mat2 for various distributions
- Fix various typos in the documentation/comments
- Add ArchLinux to the CI to ensure that mat2 is running on it
- Fix the handling of files with a name ending in `.JPG`
- Improve the detection of unsupported extensions in upper-case
- Streamline mat2's logging
# 0.3.0 - 2018-08-03
- Add a check for missing dependencies
- Add Nautilus extension
- Minors code simplifications
- Improve our linters' coverage
- Add a manpage
- Add folder/multiple files related tests
- Change the logo
# 0.2.0 - 2018-07-10
- Fix various crashes dues to malformed files
- Simplify various code-paths
- Remove superfluous debug message
- Remove the `--check` option that never was implemented anyway
- Add a `-c` option to check for mat2's dependencies
# 0.1.3 - 2018-07-06
- Improve mat2 resilience against corrupted images
- Check that the minimal version of Poppler is available
- Simplify how mat2 deals with office files
- Improve cleaning of office files
- Thumbnails are removed
- Revisions are removed
- Add support for plain text files
# 0.1.2 - 2018-06-21
- Rename some files to ease the packaging
- Add linters to the CI (mypy, bandit and pyflakes)
- Prevent exitftool-related parameters injections
- Improve mat2's resilience against corrupted files
- Make mat2 work on fedora, thanks to @atenart
- Tighten the threat model
- Simplify and improve how office files are handled
# 0.1.1 - 2018-05-16
- Improve the cli usage
- Refuse to process files with a supported mimetype but an unsupported
extension, like `text/plain` for a `.c`
# 0.1.0 - 2018-05-14
- Initial release
# Contributing to mat2
The main repository for mat2 is on [0xacab]( https://0xacab.org/jvoisin/mat2 ),
but you can send patches to jvoisin by [email](https://dustri.org/) if you prefer.
Do feel free to pick up [an issue]( https://0xacab.org/jvoisin/mat2/issues )
and to send a pull-request.
Before sending the pull-request, please do check that everything is fine by
running the full test suite in GitLab. To do that, after forking mat2 in GitLab,
you need to go in Settings -> CI/CD -> Runner and there enable shared runners.
Mat2 also has unit tests (that are also run in the full test suite). You can run
them with `python3 -m unittest discover -v`.
If you're fixing a bug or adding a new feature, please add tests accordingly,
this will greatly improve the odds of your merge-request getting merged.
If you're adding a new fileformat, please add tests for:
1. Getting metadata
2. Cleaning metadata
3. Raising `ValueError` upon a corrupted file
Since mat2 is written in Python3, please conform as much as possible to the
[pep8]( https://pep8.org/ ) style; except where it makes no sense of course.
# Doing a release
1. Update the [changelog](https://0xacab.org/jvoisin/mat2/blob/master/CHANGELOG.md)
2. Update the version in the [mat2](https://0xacab.org/jvoisin/mat2/blob/master/mat2) file
3. Update the version in the [setup.py](https://0xacab.org/jvoisin/mat2/blob/master/setup.py) file
4. Update the version in the [pyproject.toml](https://0xacab.org/jvoisin/mat2/blob/master/yproject.toml) file
5. Update the version and date in the [man page](https://0xacab.org/jvoisin/mat2/blob/master/doc/mat2.1)
6. Commit the modified files
7. Create a tag with `git tag -s $VERSION`
8. Push the commit with `git push origin master`
9. Push the tag with `git push --tags`
10. Download the gitlab archive of the release
11. Diff it against the local copy
12. If there is no difference, sign the archive with `gpg --armor --detach-sign mat2-$VERSION.tar.xz`
13. Upload the signature on Gitlab's [tag page](https://0xacab.org/jvoisin/mat2/tags) and add the changelog there
14. Announce the release on the [mailing list](https://mailman.boum.org/listinfo/mat-dev)
15. Sign'n'upload the new version on pypi with `python3 setup.py sdist bdist_wheel` then `twine upload -s dist/*`
16. Do the secret release dance
# Python ecosystem
If you feel like running arbitrary code downloaded over the
internet (pypi doesn't support gpg signatures [anymore](https://github.com/pypa/python-packaging-user-guide/pull/466)),
mat2 is [available on pypi](https://pypi.org/project/mat2/), and can be
installed like this:
```
pip3 install mat2
```
# GNU/Linux
## Optional dependencies
When [bubblewrap](https://github.com/projectatomic/bubblewrap) is
installed, mat2 uses it to sandbox any external processes it invokes.
## Arch Linux
Thanks to [kpcyrd](https://archlinux.org/packages/?maintainer=kpcyrd), there is an package available on
[Arch linux's AUR](https://archlinux.org/packages/extra/any/mat2/).
## Debian
There is a package available in [Debian](https://packages.debian.org/search?keywords=mat2&searchon=names&section=all) and you can install mat2 with:
```
apt install mat2
```
## Fedora
Thanks to [atenart](https://ack.tf/), there is a package available on
[Fedora's copr]( https://copr.fedorainfracloud.org/coprs/atenart/mat2/ ).
First you need to enable mat2's copr:
```
dnf -y copr enable atenart/mat2
```
Then you can install mat2:
```
dnf -y install mat2
```
## Gentoo
mat2 is available in the [torbrowser overlay](https://github.com/MeisterP/torbrowser-overlay).
# OSX
## Homebrew
mat2 is [available on homebrew](https://formulae.brew.sh/formula/mat2):
```
brew install mat2
```
## MacPorts
mat2 is [available on MacPorts](https://ports.macports.org/port/mat2/):
```
port install mat2
```
``` ```
_____ _____ _____ ___ _____ _____ _____ ___
| | _ |_ _|_ | Keep you data, | | _ |_ _|_ | Keep your data,
| | | | | | | | _| trash your meta! | | | | |_| | | | | _| trash your meta!
|_|_|_|__|__| |_| |___| |_|_|_|_| |_| |_| |___|
``` ```
[![pipeline status](https://0xacab.org/jvoisin/mat2/badges/master/pipeline.svg)](https://0xacab.org/jvoisin/mat2/commits/master) # Metadata and privacy
[![coverage report](https://0xacab.org/jvoisin/mat2/badges/master/coverage.svg)](https://0xacab.org/jvoisin/mat2/commits/master)
Metadata consist of information that characterizes data.
Metadata are used to provide documentation for data products.
In essence, metadata answer who, what, when, where, why, and how about
every facet of the data that are being documented.
Metadata within a file can tell a lot about you.
Cameras record data about when a picture was taken and what
camera was used. Office documents like PDF or Office automatically adds
author and company information to documents and spreadsheets.
Maybe you don't want to disclose those information.
This is precisely the job of mat2: getting rid, as much as possible, of
metadata.
mat2 provides:
- a library called `libmat2`;
- a command line tool called `mat2`,
- a service menu for Dolphin, KDE's default file manager
If you prefer a regular graphical user interface, you might be interested in
[Metadata Cleaner](https://metadatacleaner.romainvigier.fr/), which is using
`mat2` under the hood.
# Requirements # Requirements
- `python3-mutagen` for audio support - `python3-mutagen` for audio support
- `python3-gi-cairo` and `gir1.2-poppler-0.18` for PDF support - `python3-gi-cairo` and `gir1.2-poppler-0.18` for PDF support
- `gir1.2-gdkpixbuf-2.0` for images support - `gir1.2-gdkpixbuf-2.0` for images support
- `gir1.2-rsvg-2.0` for svg support
- `FFmpeg`, optionally, for video support
- `libimage-exiftool-perl` for everything else - `libimage-exiftool-perl` for everything else
- `bubblewrap`, optionally, for sandboxing
Please note that mat2 requires at least Python3.5.
Please note that MAT2 requires at least Python3.5, meaning that it # Requirements setup on macOS (OS X) using [Homebrew](https://brew.sh/)
doesn't run on [Debian Jessie](Stretc://packages.debian.org/jessie/python3),
# Run the testsuite ```bash
brew install exiftool cairo pygobject3 poppler gdk-pixbuf librsvg ffmpeg
```
# Running the test suite
```bash ```bash
$ python3 -m unittest discover -v $ python3 -m unittest discover -v
``` ```
# Related softwares And if you want to see the coverage:
```bash
$ python3-coverage run --branch -m unittest discover -s tests/
$ python3-coverage report --include -m --include /libmat2/*'
```
# How to use mat2
```
usage: mat2 [-h] [-V] [--unknown-members policy] [--inplace] [--no-sandbox]
[-v] [-l] [--check-dependencies] [-L | -s]
[files [files ...]]
Metadata anonymisation toolkit 2
positional arguments:
files the files to process
optional arguments:
-h, --help show this help message and exit
-V, --verbose show more verbose status information
--unknown-members policy
how to handle unknown members of archive-style files
(policy should be one of: abort, omit, keep) [Default:
abort]
--inplace clean in place, without backup
--no-sandbox Disable bubblewrap's sandboxing
-v, --version show program's version number and exit
-l, --list list all supported fileformats
--check-dependencies check if mat2 has all the dependencies it needs
-L, --lightweight remove SOME metadata
-s, --show list harmful metadata detectable by mat2 without
removing them
```
Note that mat2 **will not** clean files in-place, but will produce, for
example, with a file named "myfile.png" a cleaned version named
"myfile.cleaned.png".
## Web interface
It's possible to run mat2 as a web service, via
[mat2-web](https://0xacab.org/jvoisin/mat2-web).
If you're using WordPress, you might be interested in [wp-mat](https://git.autistici.org/noblogs/wp-mat)
and [wp-mat-server](https://git.autistici.org/noblogs/wp-mat-server).
## Desktop GUI
- The first iteration of [MAT](http://mat.boum.org) For GNU/Linux desktops, it's possible to use the
[Metadata Cleaner](https://gitlab.com/rmnvgr/metadata-cleaner) GTK application.
# Supported formats
The following formats are supported: avi, bmp, css, epub/ncx, flac, gif, jpeg,
m4a/mp2/mp3/…, mp4, odc/odf/odg/odi/odp/ods/odt/…, off/opus/oga/spx/…, pdf,
png, ppm, pptx/xlsx/docx/…, svg/svgz/…, tar/tar.gz/tar.bz2/tar.xz/…, tiff,
torrent, wav, wmv, zip, …
# Notes about detecting metadata
While mat2 is doing its very best to display metadata when the `--show` flag is
passed, it doesn't mean that a file is clean from any metadata if mat2 doesn't
show any. There is no reliable way to detect every single possible metadata for
complex file formats.
This is why you shouldn't rely on metadata's presence to decide if your file must
be cleaned or not.
# Notes about the lightweight mode
By default, mat2 might alter a bit the data of your files, in order to remove
as much metadata as possible. For example, texts in PDF might not be selectable anymore,
compressed images might get compressed again, …
Since some users might be willing to trade some metadata's presence in exchange
of the guarantee that mat2 won't modify the data of their files, there is the
`-L` flag that precisely does that.
# Related software
- The first iteration of [MAT](https://mat.boum.org)
- [Exiftool](https://sno.phy.queensu.ca/~phil/exiftool/mat) - [Exiftool](https://sno.phy.queensu.ca/~phil/exiftool/mat)
- [pdf-redact-tools](https://github.com/firstlookmedia/pdf-redact-tools), that - [pdf-redact-tools](https://github.com/firstlookmedia/pdf-redact-tools), that
try to deal with *printer dots* too. tries to deal with *printer dots* too.
- [pdfparanoia](https://github.com/kanzure/pdfparanoia), that removes - [pdfparanoia](https://github.com/kanzure/pdfparanoia), that removes
watermarks from PDF. watermarks from PDF.
- [Scrambled Exif](https://f-droid.org/packages/com.jarsilio.android.scrambledeggsif/),
an open-source Android application to remove metadata from pictures.
- [Dangerzone](https://dangerzone.rocks/), designed to sanitize harmful documents
into harmless ones.
# Contact
If possible, use the [issues system](https://0xacab.org/jvoisin/mat2/issues)
or the [mailing list](https://www.autistici.org/mailman/listinfo/mat-dev)
Should a more private contact be needed (eg. for reporting security issues),
you can email Julien (jvoisin) Voisin at `julien.voisin+mat2@dustri.org`,
using the gpg key `9FCDEE9E1A381F311EA62A7404D041E8171901CC`.
# Donations
If you want to donate some money, please give it to [Tails]( https://tails.boum.org/donate/?r=contribute ).
# License
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Copyright 2018 Julien (jvoisin) Voisin <julien.voisin+mat2@dustri.org>
Copyright 2016 Marie-Rose for mat2's logo
The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3,
and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx
The `narrated_powerpoint_presentation.pptx` file is in the public domain.
# Thanks
mat2 wouldn't exist without:
- the [Google Summer of Code](https://summerofcode.withgoogle.com/);
- the fine people from [Tails]( https://tails.boum.org);
- friends
Many thanks to them!
data/mat2.png

3.13 KiB | W: 0px | H: 0px

data/mat2.png

28.2 KiB | W: 0px | H: 0px

data/mat2.png
data/mat2.png
data/mat2.png
data/mat2.png
  • 2-up
  • Swipe
  • Onion skin
This diff is collapsed.
# Exiftool
mat2 is in fact using exiftool to extract metadata from files,
but not to remove them. The previous iteration of mat2, MAT,
was using exiftool to remove metadata, which lead to several cases where
they weren't correctly removed, if at all.
For example, [Exiftool's documentation](https://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/PDF.html)
states the following with regard to PDF:
> All metadata edits are reversible. While this would normally be considered an
advantage, it is a potential security problem because old information is never
actually deleted from the file.
To remove metadata, mat2 usually re-render the file completely, eliminating
all possible original metadata. See the `implementation_notes.md` file for
details.
# jpegoptim, optipng, …
While designed to reduce as much as possible the size of pictures,
those software can be used to remove metadata. They usually have very good
support for a single picture format, and can be used in place of mat2 for them.
# PDF Redact Tools
[PDF Redact Tools](https://github.com/firstlookmedia/pdf-redact-tools) is
a software developed by the people from [First Look
Media](https://firstlook.media/), the entity behind, amongst other things,
[The Intercept](https://theintercept.com/).
The tool uses roughly the same approach than mat2 to deal with PDF,
which is unfortunately the only fileformat that it does support.
It's interesting to note that it has counter-measures against
[yellow dots](https://en.wikipedia.org/wiki/Machine_Identification_Code),
a capacity that mat2 [doesn't possess yet](https://0xacab.org/jvoisin/mat2/issues/43).
# Exiv2
[Exiv2](https://www.exiv2.org/) was considered for mat2,
but it currently [misses a lot of metadata](https://0xacab.org/jvoisin/mat2/issues/85)
# Others non open source software/online service
There are a lot of closed-source software and online service claiming to remove
metadata from your files, but since there is no way to actually verify that
they're effectively removing them, let alone adding unique markers, they
shouldn't be used.
Implementation notes Implementation notes
==================== ====================
Lightweight cleaning mode
-------------------------
Due to *popular* request, mat2 is providing a *lightweight* cleaning mode,
that only cleans the superficial metadata of your file, but not
the ones that might be in **embedded** resources. Like for example,
images in a PDF or an office document.
Revisions handling
------------------
Revisions are handled according to the principle of least astonishment: they
are entirely removed.
- Either the users aren't aware of the revisions, are thus they should be
deleted. For example journalists that are editing a document to erase
mentions sources mentions.
- Or they are aware of it, and will likely not expect mat2 to be able to keep
the revisions, that are basically traces about how, when and who edited the
document.
Race conditions
---------------
mat2 does its very best to avoid crashing at runtime. This is why it's checking
if the file is valid __at parser creation__. mat2 doesn't take any measure to
ensure that the file is not changed between the time the parser is
instantiated, and the call to clean or show the metadata.
Symlink attacks Symlink attacks
--------------- ---------------
MAT2 output predictable filenames (like yourfile.jpg.cleaned). mat2 output predictable filenames (like yourfile.jpg.cleaned).
This may lead to symlink attack. Please check if you OS prevent This may lead to symlink attack. Please check if you OS prevent
against them against them
Archives handling Archives handling
----------------- -----------------
MAT2 doesn't support archives yet, because we haven't found an usable way to ask the user By default, when cleaning a non-support file format in an archive,
what to do when a non-supported files are encountered. mat2 will abort with a detailed error message.
While strongly discouraged, it's possible to override this behaviour to force
the exclusion, or inclusion of unknown files into the cleaned archive.
While Python's [zipfile](https://docs.python.org/3/library/zipfile.html) module
provides *safe* way to extract members of a zip archive, the
[tarfile](https://docs.python.org/3/library/tarfile.html) one doesn't,
meaning that it's up to mat2 to implement safety checks. Currently,
it defends against path-traversal, both relative and absolute,
symlink-related attacks, setuid/setgid attacks, duplicate members, block and
char devices, … but there might still be dragons lurking there.
PDF handling PDF handling
------------ ------------
...@@ -23,10 +65,10 @@ didn't remove any *deep metadata*, like the ones in embedded pictures. This was ...@@ -23,10 +65,10 @@ didn't remove any *deep metadata*, like the ones in embedded pictures. This was
on of the reason MAT was abandoned: the absence of satisfying solution to on of the reason MAT was abandoned: the absence of satisfying solution to
handle PDF. But apparently, people are ok with [pdf redact handle PDF. But apparently, people are ok with [pdf redact
tools](https://github.com/firstlookmedia/pdf-redact-tools), that simply tools](https://github.com/firstlookmedia/pdf-redact-tools), that simply
transform the PDF into images. So this is what's MAT2 is doing too. transform the PDF into images. So this is what's mat2 is doing too.
Of course, it would be possible to detect images in PDf file, and process them Of course, it would be possible to detect images in PDf file, and process them
with MAT2, but since a PDF can contain a lot of things, like images, videos, with mat2, but since a PDF can contain a lot of things, like images, videos,
javascript, pdf, blobs, … this is the easiest and safest way to clean them. javascript, pdf, blobs, … this is the easiest and safest way to clean them.
Images handling Images handling
...@@ -35,3 +77,11 @@ Images handling ...@@ -35,3 +77,11 @@ Images handling
When possible, images are handled like PDF: rendered on a surface, then saved When possible, images are handled like PDF: rendered on a surface, then saved
to the filesystem. This ensures that every metadata is removed. to the filesystem. This ensures that every metadata is removed.
XML attacks
-----------
Since our threat model conveniently excludes files crafted to specifically
bypass mat2, fileformats containing harmful XML are out of our scope.
But since mat2 is using [etree](https://docs.python.org/3/library/xml.html#xml-vulnerabilities)
to process XML, it's "only" vulnerable to DoS, and not memory corruption:
odds are that the user will notice that the cleaning didn't succeed.
.TH mat2 "1" "January 2025" "mat2 0.13.5" "User Commands"
.SH NAME
mat2 \- the metadata anonymisation toolkit 2
.SH SYNOPSIS
\fBmat2\fR [\-h] [\-v] [\-l] [\-V] [-s | -L] [\fIfiles\fR [\fIfiles ...\fR]]
.SH DESCRIPTION
.B mat2
removes metadata from various fileformats. It supports a wide variety of file
formats, audio, office, images, …
Careful, mat2 does not clean files in-place, instead, it will produce a file with the word
"cleaned" between the filename and its extension, for example "filename.cleaned.png"
for a file named "filename.png".
.SH OPTIONS
.SS "positional arguments:"
.TP
\fBfiles\fR
the files to process
.SS "optional arguments:"
.TP
\fB\-h\fR, \fB\-\-help\fR
show this help message and exit
.TP
\fB\-v\fR, \fB\-\-version\fR
show program's version number and exit
.TP
\fB\-l\fR, \fB\-\-list\fR
list all supported fileformats
.TP
\fB\-\-check\-dependencies\fR
check if mat2 has all the dependencies it needs
.TP
\fB\-V\fR, \fB\-\-verbose\fR
show more verbose status information
.TP
\fB\-\-unknown-members\fR \fIpolicy\fR
how to handle unknown members of archive-style files (policy should be one of: abort, omit, keep)
.TP
\fB\-s\fR, \fB\-\-show\fR
list harmful metadata detectable by mat2 without removing them
.TP
\fB\-L\fR, \fB\-\-lightweight\fR
remove SOME metadata
.TP
\fB\--no-sandbox\fR
disable bubblewrap's sandboxing
.TP
\fB\--inplace\fR
clean in place, without backup
.SH EXAMPLES
To remove all the metadata from a PDF file:
.PP
.nf
.RS
mat2 ./myfile.pdf
.RE
.fi
.PP
.SH NOTES ABOUT METADATA
While mat2 is doing its very best to display metadata when the --show flag is
passed, it doesn't mean that a file is clean from any metadata if mat2 doesn't
show any. There is no reliable way to detect every single possible metadata for
complex file formats.
.PP
This is why you shouldn't rely on metadata's presence to decide if your file must
be cleaned or not.
.PP
Moreover, mat2 goes to great lengths to make sure that as much metadata as
possible are removed. This might sometimes result in a loss of quality of the
processed files. For example, textual based pdf file converted into image based
one means that it'll be no longer possible to select text in them. If you're
experiencing this, you might want to give the lightweight cleaning mode a try,
but keep in mind by doing so, some metadata \fBwon't be cleaned\fR.
.SH BUGS
While mat2 does its very best to remove every single metadata,
it's still in beta, and \fBsome\fR might remain. Should you encounter
some issues, check the bugtracker: https://0xacab.org/jvoisin/mat2/issues
.PP
Please use accordingly and be careful.
.SH AUTHOR
This software was made by Julien (jvoisin) Voisin with the support of the Tails project.
.SH COPYRIGHT
This software is released on LGPLv3.
.SH "SEE ALSO"
.BR exiftool (1p)
.BR pdf-redact-tools (1)
Threat Model Threat Model
============ ============
The Metadata Anonymisation Toolkit 2 adversary has a number The Metadata Anonymisation Toolkit 2 adversary has a number
of goals, capabilities, and counter-attack types that can be of goals, capabilities, and counter-attack types that can be
used to guide us towards a set of requirements for the MAT2. used to guide us towards a set of requirements for the mat2.
This is an overhaul of MAT's (the first iteration of the software) one. This is an overhaul of MAT's (the first iteration of the software) one.
...@@ -13,17 +14,19 @@ Mat only removes standard metadata from your files, it does _not_: ...@@ -13,17 +14,19 @@ Mat only removes standard metadata from your files, it does _not_:
- anonymise their content (the substance and the form) - anonymise their content (the substance and the form)
- handle watermarking - handle watermarking
- handle steganography - handle steganography nor homoglyphs
- handle stylometry - handle stylometry
- handle any non-standard metadata field/system - handle any non-standard metadata field/system
- handle file-system related metadata
If you really want to be anonymous format that does not contain any If you really want to be anonymous format that does not contain any
metadata, or better : use plain-text ASCII without trailing spaces. metadata, or better : use plain-text ASCII without trailing spaces.
And as usual, think before clicking.
And as usual, think twice before clicking.
Adversary Adversary
------------ ---------
* Goals: * Goals:
...@@ -40,17 +43,18 @@ Adversary ...@@ -40,17 +43,18 @@ Adversary
to directly identify the author and/or source, his next to directly identify the author and/or source, his next
goal is to determine the source of the equipment used goal is to determine the source of the equipment used
to produce, copy, and transmit the document. This can to produce, copy, and transmit the document. This can
include the model of camera used to take a photo, or include the model of camera used to take a photo or a film,
which software was used to produce an office document. which software was used to produce an office document, …
* Adversary Capabilities - Positioning * Adversary Capabilities - Positioning
- The adversary created the document specifically for this - The adversary created the document specifically for this
user. This is the strongest position for the adversary to user. This is the strongest position for the adversary to
have. In this case, the adversary is capable of inserting have. In this case, the adversary is capable of inserting
arbitrary, custom watermarks specifically for tracking arbitrary, custom watermarks specifically for tracking
the user. In general, MAT cannot defend against this the user. In general, mat2 cannot defend against this
adversary, but we list it for completeness. adversary, but we list it for completeness' sake.
- The adversary created the document for a group of users. - The adversary created the document for a group of users.
In this case, the adversary knows that they attempted to In this case, the adversary knows that they attempted to
...@@ -58,30 +62,39 @@ Adversary ...@@ -58,30 +62,39 @@ Adversary
or may not have watermarked the document for these or may not have watermarked the document for these
users, but they certainly know the format used. users, but they certainly know the format used.
- The adversary did not create the document, the weakest - The adversary did not create the document, the weakest
position for the adversary to have. The file format is (most of the time) position for the adversary to have. The file format is
standard, nothing custom is added: MAT (most of the time) standard, nothing custom is added:
should be able to remove all meta-information from the mat2 must be able to remove all metadata from the file.
file.
Requirements Requirements
--------------- ------------
* Processing * Processing
- The MAT2 *should* avoid interactions with information.
- mat2 *should* avoid interactions with information.
Its goal is to remove metadata, and the user is solely Its goal is to remove metadata, and the user is solely
responsible for the information of the file. responsible for the information of the file.
- The MAT2 *must* warn when encountering an unknown - mat2 *must* warn when encountering an unknown
format. For example, in a zipfile, if MAT encounters an format. For example, in a zipfile, if mat2 encounters an
unknown format, it should warn the user, and ask if the unknown format, it should warn the user, and ask if the
file should be added to the anonymised archive that is file should be added to the anonymised archive that is
produced. produced.
- The MAT2 *must* not add metadata, since its purpose is to - mat2 *must* not add metadata, since its purpose is to
anonymise files: every added items of metadata decreases anonymise files: every added items of metadata decreases
anonymity. anonymity.
- The MAT2 *should* handle unknown/hidden metadata fields, - mat2 *should* handle unknown/hidden metadata fields,
like proprietary extensions of open formats. like proprietary extensions of open formats.
- mat2 *must not* fail silently. Upon failure,
mat2 *must not* modify the file in any way.
- mat2 *might* leak the fact that mat2 was used on the file,
since it might be uncommon for some file formats to come
without any kind of metadata, an adversary might suspect that
the user used mat2 on certain files.
Dolphin integration
===================
Thanks to [Miguel Marco](https://riemann.unizar.es/~mmarco/), here is an neat
integration for [Dolphin](https://kde.org/applications/system/org.kde.dolphin),
the KDE file manager:
1. Add the `mat2.desktop` file either in
- `/usr/share/kservices5/ServiceMenus/` to install it globally
- `~/.local/share/kservices5/ServiceMenus/` for a specific user
2. Run `kbuildsycoca5` to update the corresponding database
3. Enjoy your new contextual menu to remove metadata from your files!
[Desktop Entry]
X-KDE-ServiceTypes=KonqPopupMenu/Plugin
MimeType=application/pdf;application/vnd.oasis.opendocument.chart;application/vnd.oasis.opendocument.formula;application/vnd.oasis.opendocument.graphics;application/vnd.oasis.opendocument.image;application/vnd.oasis.opendocument.presentation;application/vnd.oasis.opendocument.spreadsheet;application/vnd.oasis.opendocument.text;application/vnd.openxmlformats-officedocument.presentationml.presentation;application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;application/vnd.openxmlformats-officedocument.wordprocessingml.document;application/x-bittorrent;application/zip;audio/flac;audio/mpeg;audio/ogg;audio/x-flac;image/jpeg;image/png;image/tiff;image/x-ms-bmp;text/plain;video/mp4;video/x-msvideo;
Actions=cleanMetadata;
Type=Service
[Desktop Action cleanMetadata]
Name=Clean metadata
Name[de]=Metadaten löschen
Name[es]=Limpiar metadatos
Icon=/usr/share/icons/hicolor/scalable/apps/mat2.svg
Exec=kdialog --yesno "$( mat2 -s %F )" --title "Clean Metadata?" && mat2 %U
Exec[de]=kdialog --yesno "$( mat2 -s %F )" --title "Metadaten löschen?" && mat2 %U
#!/usr/bin/env python3
import enum
import importlib
from typing import Dict
from . import exiftool, video
# A set of extension that aren't supported, despite matching a supported mimetype
UNSUPPORTED_EXTENSIONS = {
'.asc',
'.bat',
'.brf',
'.c',
'.h',
'.ksh',
'.pl',
'.pot',
'.rdf',
'.srt',
'.wsdl',
'.xpdl',
'.xsd',
'.xsl',
}
DEPENDENCIES = {
'Cairo': {
'module': 'cairo',
'required': True,
},
'PyGobject': {
'module': 'gi',
'required': True,
},
'GdkPixbuf from PyGobject': {
'module': 'gi.repository.GdkPixbuf',
'required': True,
},
'Poppler from PyGobject': {
'module': 'gi.repository.Poppler',
'required': True,
},
'GLib from PyGobject': {
'module': 'gi.repository.GLib',
'required': True,
},
'Mutagen': {
'module': 'mutagen',
'required': True,
},
}
CMD_DEPENDENCIES = {
'Exiftool': {
'cmd': exiftool._get_exiftool_path,
'required': False,
},
'Ffmpeg': {
'cmd': video._get_ffmpeg_path,
'required': False,
},
}
def check_dependencies() -> Dict[str, Dict[str, bool]]:
ret: Dict[str, Dict] = dict()
for key, value in DEPENDENCIES.items():
ret[key] = {
'found': True,
'required': value['required'],
}
try:
importlib.import_module(value['module']) # type: ignore
except ImportError: # pragma: no cover
ret[key]['found'] = False
for k, v in CMD_DEPENDENCIES.items():
ret[k] = {
'found': True,
'required': v['required'],
}
try:
v['cmd']() # type: ignore
except RuntimeError: # pragma: no cover
ret[k]['found'] = False
return ret
@enum.unique
class UnknownMemberPolicy(enum.Enum):
ABORT = 'abort'
OMIT = 'omit'
KEEP = 'keep'
import abc
import os
import re
from typing import Union, Set, Dict
class AbstractParser(abc.ABC):
""" This is the base class of every parser.
It might yield `ValueError` on instantiation on invalid files,
and `RuntimeError` when something went wrong in `remove_all`.
"""
meta_list: Set[str] = set()
mimetypes: Set[str] = set()
def __init__(self, filename: str) -> None:
"""
:raises ValueError: Raised upon an invalid file
"""
if re.search('^[a-z0-9./]', filename) is None:
# Some parsers are calling external binaries,
# this prevents shell command injections
filename = os.path.join('.', filename)
self.filename = filename
fname, extension = os.path.splitext(filename)
# Special case for tar.gz, tar.bz2, … files
if fname.endswith('.tar') and len(fname) > 4:
fname, extension = fname[:-4], '.tar' + extension
self.output_filename = fname + '.cleaned' + extension
self.lightweight_cleaning = False
self.sandbox = True
@abc.abstractmethod
def get_meta(self) -> Dict[str, Union[str, Dict]]:
"""Return all the metadata of the current file
:raises RuntimeError: Raised if the cleaning process went wrong.
"""
@abc.abstractmethod
def remove_all(self) -> bool:
"""
Remove all the metadata of the current file
:raises RuntimeError: Raised if the cleaning process went wrong.
"""
This diff is collapsed.
import mimetypes
import os
import shutil
import tempfile
from typing import Union, Dict
import mutagen
from . import abstract, parser_factory, video
class MutagenParser(abstract.AbstractParser):
def __init__(self, filename):
super().__init__(filename)
try:
if mutagen.File(self.filename) is None:
raise ValueError
except mutagen.MutagenError:
raise ValueError
def get_meta(self) -> Dict[str, Union[str, Dict]]:
f = mutagen.File(self.filename)
if f.tags:
return {k: ', '.join(map(str, v)) for k, v in f.tags.items()}
return {}
def remove_all(self) -> bool:
shutil.copy(self.filename, self.output_filename)
f = mutagen.File(self.output_filename)
try:
f.delete()
f.save()
except mutagen.MutagenError:
raise ValueError
return True
class MP3Parser(MutagenParser):
mimetypes = {'audio/mpeg', }
def get_meta(self) -> Dict[str, Union[str, Dict]]:
metadata: Dict[str, Union[str, Dict]] = dict()
meta = mutagen.File(self.filename).tags
if not meta:
return metadata
for key in meta:
if isinstance(key, tuple):
metadata[key[0]] = key[1]
continue
if not hasattr(meta[key], 'text'): # pragma: no cover
continue
metadata[key.rstrip(' \t\r\n\0')] = ', '.join(map(str, meta[key].text))
return metadata
class OGGParser(MutagenParser):
mimetypes = {'audio/ogg', }
class FLACParser(MutagenParser):
mimetypes = {'audio/flac', 'audio/x-flac'}
def remove_all(self) -> bool:
shutil.copy(self.filename, self.output_filename)
f = mutagen.File(self.output_filename)
f.clear_pictures()
f.delete()
f.save(deleteid3=True)
return True
def get_meta(self) -> Dict[str, Union[str, Dict]]:
meta = super().get_meta()
for num, picture in enumerate(mutagen.File(self.filename).pictures):
name = picture.desc if picture.desc else 'Cover %d' % num
extension = mimetypes.guess_extension(picture.mime)
if extension is None: # pragma: no cover
meta[name] = 'harmful data'
continue
_, fname = tempfile.mkstemp()
fname = fname + extension
with open(fname, 'wb') as f:
f.write(picture.data)
p, _ = parser_factory.get_parser(fname) # type: ignore
if p is None:
raise ValueError
p.sandbox = self.sandbox
# Mypy chokes on ternaries :/
meta[name] = p.get_meta() if p else 'harmful data' # type: ignore
os.remove(fname)
return meta
class WAVParser(video.AbstractFFmpegParser):
mimetypes = {'audio/x-wav', }
meta_allowlist = {'AvgBytesPerSec', 'BitsPerSample', 'Directory',
'Duration', 'Encoding', 'ExifToolVersion',
'FileAccessDate', 'FileInodeChangeDate',
'FileModifyDate', 'FileName', 'FilePermissions',
'FileSize', 'FileType', 'FileTypeExtension',
'MIMEType', 'NumChannels', 'SampleRate', 'SourceFile',
}
class AIFFParser(video.AbstractFFmpegParser):
mimetypes = {'audio/aiff', 'audio/x-aiff'}
meta_allowlist = {'AvgBytesPerSec', 'BitsPerSample', 'Directory',
'Duration', 'Encoding', 'ExifToolVersion',
'FileAccessDate', 'FileInodeChangeDate',
'FileModifyDate', 'FileName', 'FilePermissions',
'FileSize', 'FileType', 'FileTypeExtension',
'MIMEType', 'NumChannels', 'SampleRate', 'SourceFile',
'NumSampleFrames', 'SampleSize',
}