#!/usr/bin/env python3

import os
from typing import Tuple, Generator, List, Union
import sys
import mimetypes
import argparse
import logging
import unicodedata

# If importing libmat2 raises a ValueError, show its message and exit with
# status 1 instead of dumping a traceback on the user.
try:
    from libmat2 import parser_factory, UNSUPPORTED_EXTENSIONS
    from libmat2 import check_dependencies, UnknownMemberPolicy
except ValueError as e:
    print(e)
    sys.exit(1)

__version__ = '0.8.0'

# Make pyflakes happy: reference the typing names so that they are not
# reported as unused imports.
assert Tuple
assert Union

# Root logger setup: only warnings and above are shown by default.
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.WARNING)


def __check_file(filename: str, mode: int = os.R_OK) -> bool:
    """Tell whether `filename` is an existing regular file accessible with `mode`.

    The checks are done in order (existence, regular file, permissions) and a
    one-line diagnostic is printed on stdout for the first one that fails.

    :param filename: path of the file to check.
    :param mode: bitmask of `os.R_OK`, `os.W_OK` and/or `os.X_OK`, handed
                 to `os.access`.
    :return: True if every check passed, False otherwise.
    """
    if not os.path.exists(filename):
        print("[-] %s doesn't exist." % filename)
        return False
    if not os.path.isfile(filename):
        print("[-] %s is not a regular file." % filename)
        return False
    if not os.access(filename, mode):
        # Only mention the permissions that were actually requested, so that
        # a read-only check doesn't complain about the file not being
        # writeable.
        known_flags = (
            (os.R_OK, 'readable'),
            (os.W_OK, 'writeable'),
            (os.X_OK, 'executable'),
        )
        requested = [label for flag, label in known_flags if mode & flag]
        print("[-] %s is not %s." % (filename, ' and '.join(requested) or 'accessible'))
        return False
    return True


def create_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser of the `mat2` executable.

    Besides the positional `files`, the parser knows about `--version`,
    `--list`, `--check-dependencies`, `--verbose` and `--unknown-members`,
    plus `--show` and `--lightweight`, which can't be used together.

    :return: the configured parser; no argument is parsed here.
    """
    parser = argparse.ArgumentParser(description='Metadata anonymisation toolkit 2')
    parser.add_argument('files', nargs='*', help='the files to process')
    parser.add_argument('-v', '--version', action='version',
                        version='MAT2 %s' % __version__)
    parser.add_argument('-l', '--list', action='store_true',
                        help='list all supported fileformats')
    parser.add_argument('--check-dependencies', action='store_true',
                        help='check if MAT2 has all the dependencies it needs')
    parser.add_argument('-V', '--verbose', action='store_true',
                        help='show more verbose status information')
    # The `%` below only applies to the second string literal, since it binds
    # tighter than `+`; this is fine because the `%s` lives in that literal.
    parser.add_argument('--unknown-members', metavar='policy', default='abort',
                        help='how to handle unknown members of archive-style files (policy should' +
                        ' be one of: %s) [Default: abort]' % ', '.join(p.value for p in UnknownMemberPolicy))

    # Showing the metadata and (lightly) removing them are mutually exclusive.
    info = parser.add_mutually_exclusive_group()
    info.add_argument('-s', '--show', action='store_true',
                      help='list harmful metadata detectable by MAT2 without removing them')
    info.add_argument('-L', '--lightweight', action='store_true',
                      help='remove SOME metadata')
    return parser


def show_meta(filename: str):
    """Print the metadata found in `filename` on stdout.

    Nothing but a diagnostic is printed if the file fails `__check_file`, or
    if `parser_factory.get_parser` has no parser to offer for it.

    :param filename: path of the file to inspect.
    """
    if not __check_file(filename):
        return

    p, mtype = parser_factory.get_parser(filename)  # type: ignore
    if p is None:
        print("[-] %s's format (%s) is not supported" % (filename, mtype))
        return
    __print_meta(filename, p.get_meta())


def __print_meta(filename: str, metadata: dict, depth: int = 1):
    """Pretty-print `metadata` on stdout, recursing into nested dicts.

    Entries are printed sorted by key. A value that is itself a dict is
    printed as its own section, one level deeper, titled with its key.

    :param filename: title of the section being printed.
    :param metadata: mapping of metadata names to their values.
    :param depth: nesting level; it drives both the indentation and the
                  number of `+` in the section header.
    """
    padding = " " * depth*2
    if not metadata:
        print(padding + "No metadata found")
        return

    print("[%s] Metadata for %s:" % ('+'*depth, filename))

    for (k, v) in sorted(metadata.items()):
        if isinstance(v, dict):
            __print_meta(k, v, depth+1)
            continue

        # Remove control characters
        # We might use 'Cc' instead of 'C', but better safe than sorry
        # https://www.unicode.org/reports/tr44/#GC_Values_Table
        try:
            v = ''.join(ch for ch in v if not unicodedata.category(ch).startswith('C'))
        except TypeError:
            pass  # for things that aren't iterable

        try:  # FIXME this is ugly.
            print(padding + "  %s: %s" % (k, v))
        except UnicodeEncodeError:
            # The value can't be encoded for the output stream: don't echo it.
            print(padding + "  %s: harmful content" % k)


def clean_meta(filename: str, is_lightweight: bool, policy: UnknownMemberPolicy) -> bool:
    """Remove the metadata of `filename` with the parser matching its format.

    :param filename: path of the file to clean; it must be readable.
    :param is_lightweight: stored in the parser's `lightweight_cleaning`
                           attribute before cleaning.
    :param policy: stored in the parser's `unknown_member_policy` attribute
                   before cleaning.
    :return: whatever the parser's `remove_all()` returns, or False if the
             file is unusable, its format has no parser, or `remove_all()`
             raised a RuntimeError.
    """
    if not __check_file(filename, os.R_OK):
        return False

    p, mtype = parser_factory.get_parser(filename)  # type: ignore
    if p is None:
        print("[-] %s's format (%s) is not supported" % (filename, mtype))
        return False
    p.unknown_member_policy = policy
    p.lightweight_cleaning = is_lightweight

    try:
        return p.remove_all()
    except RuntimeError as e:
        # Report the failure and carry on, so that one bad file doesn't
        # abort the whole run.
        print("[-] %s can't be cleaned: %s" % (filename, e))
    return False


def show_parsers():
    """Print on stdout the mimetypes handled by the available parsers.

    Each line shows a mimetype followed by the file extensions that
    `mimetypes` associates with it, minus those in UNSUPPORTED_EXTENSIONS.
    Mimetypes left without any extension are not listed.
    """
    print('[+] Supported formats:')
    formats = set()  # Set[str]
    for parser in parser_factory._get_parsers():  # type: ignore
        for mtype in parser.mimetypes:
            extensions = set()  # Set[str]
            for extension in mimetypes.guess_all_extensions(mtype):
                if extension not in UNSUPPORTED_EXTENSIONS:
                    extensions.add(extension)
            if not extensions:
                # we're not supporting a single extension in the current
                # mimetype, so there is no point in showing the mimetype at all
                continue
            # Sort the extensions: the iteration order of a set of strings
            # isn't stable from one run to another.
            formats.add('  - %s (%s)' % (mtype, ', '.join(sorted(extensions))))
    print('\n'.join(sorted(formats)))


def __get_files_recursively(files: List[str]) -> Generator[str, None, None]:
    """Yield the usable files designated by `files`.

    Directories are walked recursively with `os.walk`; any other entry is
    considered as-is. Only the paths accepted by `__check_file` are yielded.

    :param files: paths of files and/or directories.
    :return: a generator of file paths.
    """
    for entry in files:
        if os.path.isdir(entry):
            for dirpath, _, basenames in os.walk(entry):
                for basename in basenames:
                    fname = os.path.join(dirpath, basename)
                    if __check_file(fname):
                        yield fname
        elif __check_file(entry):
            yield entry


def main() -> int:
    """Parse the command line and run the requested action.

    Without any file, this either lists the supported formats, reports the
    status of the dependencies, or prints the help. With files, their
    metadata are either shown (`--show`) or removed.

    :return: the exit status of the program: 0 on success, 1 if the value of
             `--unknown-members` is rejected, -1 if at least one file
             couldn't be cleaned.
    """
    arg_parser = create_arg_parser()
    args = arg_parser.parse_args()

    if args.verbose:
        # The root logger got its handler from the logging.basicConfig() call
        # done at import time, and basicConfig() does nothing once a handler
        # is installed: the level has to be changed on the logger itself.
        logging.getLogger().setLevel(logging.INFO)

    if not args.files:
        if args.list:
            show_parsers()
            return 0
        elif args.check_dependencies:
            print("Dependencies required for MAT2 %s:" % __version__)
            for key, value in sorted(check_dependencies().items()):
                print('- %s: %s' % (key, 'yes' if value else 'no'))
        else:
            arg_parser.print_help()
        return 0

    elif args.show:
        for f in __get_files_recursively(args.files):
            show_meta(f)
        return 0

    else:
        try:
            policy = UnknownMemberPolicy(args.unknown_members)
        except ValueError:
            # Same exit status as the traceback this replaces.
            print("[-] %s is not a valid policy for --unknown-members." % args.unknown_members)
            return 1
        if policy == UnknownMemberPolicy.KEEP:
            logging.warning('Keeping unknown member files may leak metadata in the resulting file!')

        no_failure = True
        for f in __get_files_recursively(args.files):
            # Only an explicit False counts as a failure.
            if clean_meta(f, args.lightweight, policy) is False:
                no_failure = False
        return 0 if no_failure else -1


# Only run the command-line interface when executed as a script, and hand the
# value returned by main() to the shell as the exit status.
if __name__ == '__main__':
    sys.exit(main())