mat2 8.01 KB
Newer Older
1
#!/usr/bin/env python3
2

jvoisin's avatar
jvoisin committed
3
import os
jvoisin's avatar
jvoisin committed
4
import shutil
jvoisin's avatar
jvoisin committed
5
from typing import Tuple, List, Union, Set
6
import sys
jvoisin's avatar
jvoisin committed
7
import mimetypes
jvoisin's avatar
jvoisin committed
8
import argparse
dkg's avatar
dkg committed
9
import logging
10
import unicodedata
jvoisin's avatar
jvoisin committed
11
import concurrent.futures
jvoisin's avatar
jvoisin committed
12

13
try:
14 15
    from libmat2 import parser_factory, UNSUPPORTED_EXTENSIONS
    from libmat2 import check_dependencies, UnknownMemberPolicy
16 17 18
except ValueError as e:
    print(e)
    sys.exit(1)
jvoisin's avatar
jvoisin committed
19

jvoisin's avatar
jvoisin committed
20
__version__ = '0.9.0'

# Touch the typing names at runtime, so that pyflakes doesn't flag
# them as unused imports.
assert all((Set, Tuple, Union))

# Log lines look like "LEVEL: message"; warnings and above are shown
# unless the level is lowered later on.
logging.basicConfig(level=logging.WARNING, format='%(levelname)s: %(message)s')

29

30
def __check_file(filename: str, mode: int = os.R_OK) -> bool:
    """Tell whether `filename` is an existing regular file usable with `mode`.

    The checks are done in order (existence, regular file, access rights)
    and a diagnostic line is printed for the first one that fails.

    :param filename: path of the file to check
    :param mode: bitmask of `os.R_OK` and/or `os.W_OK` that must be granted
    :return: True if every check passed, False otherwise
    """
    if not os.path.exists(filename):
        print("[-] %s doesn't exist." % filename)
        return False
    if not os.path.isfile(filename):
        print("[-] %s is not a regular file." % filename)
        return False
    if not os.access(filename, mode):
        # Collect whole words: `+=` with a str would extend the list one
        # character at a time and garble the message.
        mode_str = []  # type: List[str]
        if mode & os.R_OK:
            mode_str.append('readable')
        if mode & os.W_OK:
            mode_str.append('writeable')
        # Fall back on a generic word if neither R_OK nor W_OK was requested.
        wanted = ' nor '.join(mode_str) or 'accessible'
        print("[-] %s is not %s." % (filename, wanted))
        return False
    return True
jvoisin's avatar
jvoisin committed
46

47

48
def create_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser of the mat2 executable.

    The parser exposes some general options, a mutually exclusive group
    selecting what to do (process files, print the version, list the
    supported formats, check the dependencies), and a second mutually
    exclusive group selecting how files are processed (lightweight
    cleaning or metadata display).

    :return: the fully configured parser
    """
    parser = argparse.ArgumentParser(description='Metadata anonymisation toolkit 2')

    policies = [p.value for p in UnknownMemberPolicy]

    parser.add_argument('-V', '--verbose', action='store_true',
                        help='show more verbose status information')
    # `choices` makes argparse reject an unknown policy with a usage error.
    parser.add_argument('--unknown-members', metavar='policy', default='abort',
                        choices=policies,
                        help='how to handle unknown members of archive-style '
                        'files (policy should be one of: %s) [Default: abort]' %
                        ', '.join(policies))
    parser.add_argument('--inplace', action='store_true',
                        help='clean in place, without backup')
    # The sandbox is on by default, and `--no-sandbox` turns it *off*:
    # the flag has to store False into `args.sandbox`, not True.
    parser.add_argument('--no-sandbox', dest='sandbox', action='store_false',
                        default=True, help='Disable bubblewrap\'s sandboxing.')

    action_group = parser.add_mutually_exclusive_group()
    action_group.add_argument('files', nargs='*', help='the files to process',
                              default=[])
    action_group.add_argument('-v', '--version', action='version',
                              version='MAT2 %s' % __version__)
    action_group.add_argument('-l', '--list', action='store_true', default=False,
                              help='list all supported fileformats')
    action_group.add_argument('--check-dependencies', action='store_true',
                              default=False,
                              help='check if MAT2 has all the dependencies it '
                              'needs')

    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument('-L', '--lightweight', action='store_true',
                            help='remove SOME metadata')
    mode_group.add_argument('-s', '--show', action='store_true',
                            help='list harmful metadata detectable by MAT2 '
                            'without removing them')

    return parser

83

84
def show_meta(filename: str, sandbox: bool):
    """Print the metadata of `filename` on the standard output.

    Nothing but a diagnostic is printed if the file isn't a readable
    regular file, or if no parser is available for its format.

    :param filename: path of the file to inspect
    :param sandbox: value assigned to the parser's `sandbox` attribute
    """
    if __check_file(filename):
        file_parser, mimetype = parser_factory.get_parser(filename)  # type: ignore
        if file_parser is not None:
            file_parser.sandbox = sandbox
            __print_meta(filename, file_parser.get_meta())
        else:
            print("[-] %s's format (%s) is not supported" % (filename, mimetype))
94 95


96
def __print_meta(filename: str, metadata: dict, depth: int = 1):
    """Pretty-print `metadata`, recursing into nested dictionaries.

    :param filename: name displayed in the header line
    :param metadata: mapping of metadata names to their values; a value
        that is itself a dict is printed as a nested section
    :param depth: nesting level, driving the indentation and the number
        of `+` in the header
    """
    indent = " " * (depth * 2)

    if not metadata:
        print(indent + "No metadata found in %s." % filename)
        return

    print("[%s] Metadata for %s:" % ('+' * depth, filename))

    for name, value in sorted(metadata.items()):
        if isinstance(value, dict):
            # Nested section: the key plays the role of the filename.
            __print_meta(name, value, depth + 1)
            continue

        # Strip every character whose Unicode general category is in the
        # "Other" family (C*), and not only the control ones (Cc): better
        # safe than sorry.
        # https://www.unicode.org/reports/tr44/#GC_Values_Table
        try:
            kept = [char for char in value
                    if unicodedata.category(char)[0] != 'C']
            value = ''.join(kept)
        except TypeError:
            # Values that can't be walked character by character are
            # printed untouched.
            pass

        try:  # FIXME this is ugly.
            print(indent + "  %s: %s" % (name, value))
        except UnicodeEncodeError:
            print(indent + "  %s: harmful content" % name)
121

jvoisin's avatar
jvoisin committed
122

123
def clean_meta(filename: str, is_lightweight: bool, inplace: bool, sandbox: bool,
               policy: UnknownMemberPolicy) -> bool:
    """Remove the metadata of `filename`.

    :param filename: path of the file to clean
    :param is_lightweight: value assigned to the parser's
        `lightweight_cleaning` attribute
    :param inplace: if True, the original file is replaced by the parser's
        output file, and must thus be writeable
    :param sandbox: value assigned to the parser's `sandbox` attribute
    :param policy: value assigned to the parser's `unknown_member_policy`
        attribute
    :return: the result of the parser's `remove_all()`, or False if the
        file can't be processed or if cleaning raised a RuntimeError
    """
    required_mode = os.R_OK
    if inplace:
        # The original is going to be overwritten.
        required_mode |= os.W_OK
    if not __check_file(filename, required_mode):
        return False

    cleaner, mimetype = parser_factory.get_parser(filename)  # type: ignore
    if cleaner is None:
        print("[-] %s's format (%s) is not supported" % (filename, mimetype))
        return False

    cleaner.sandbox = sandbox
    cleaner.lightweight_cleaning = is_lightweight
    cleaner.unknown_member_policy = policy

    try:
        logging.debug('Cleaning %s…', filename)
        succeeded = cleaner.remove_all()
        if succeeded is True:
            # Give the output file the permission bits of the original.
            shutil.copymode(filename, cleaner.output_filename)
            if inplace is True:
                os.rename(cleaner.output_filename, filename)
    except RuntimeError as error:
        print("[-] %s can't be cleaned: %s" % (filename, error))
        return False
    return succeeded

jvoisin's avatar
jvoisin committed
149

150
def show_parsers():
    """Print the supported mimetypes, along with their file extensions.

    A mimetype is only listed if at least one of the extensions guessed
    for it isn't in UNSUPPORTED_EXTENSIONS. Both the mimetypes and the
    extensions of each mimetype are sorted, to get a stable output.
    """
    print('[+] Supported formats:')
    formats = set()  # type: Set[str]
    for parser in parser_factory._get_parsers():  # type: ignore
        for mtype in parser.mimetypes:
            # Sorted, since iterating over a set of strings yields an
            # order that can change from one run to the next.
            extensions = sorted({
                extension
                for extension in mimetypes.guess_all_extensions(mtype)
                if extension not in UNSUPPORTED_EXTENSIONS
            })
            if not extensions:
                # we're not supporting a single extension in the current
                # mimetype, so there is no point in showing the mimetype at all
                continue
            formats.add('  - %s (%s)' % (mtype, ', '.join(extensions)))
    print('\n'.join(sorted(formats)))
165

166

jvoisin's avatar
jvoisin committed
167 168
def __get_files_recursively(files: List[str]) -> List[str]:
    """Expand `files` into the list of regular, readable files it designates.

    Directories are walked recursively; every other path is kept only if
    it passes `__check_file`, which prints a diagnostic otherwise.

    :param files: paths of files and/or directories
    :return: the usable file paths, without duplicates, sorted so that
        they are always processed in the same order
    """
    ret = set()  # type: Set[str]
    for f in files:
        if os.path.isdir(f):
            for path, _, _files in os.walk(f):
                candidates = (os.path.join(path, _f) for _f in _files)
                ret.update(fname for fname in candidates if __check_file(fname))
        elif __check_file(f):
            ret.add(f)
    # A set has no meaningful order: sort it to get a deterministic result.
    return sorted(ret)
179

180

jvoisin's avatar
jvoisin committed
181
def main() -> int:
    """Entry point of the command-line interface.

    Depending on the parsed arguments, this lists the supported formats,
    reports the state of the dependencies, prints the help, shows the
    metadata of the given files, or cleans them.

    :return: the exit status of the process: 0 on success, -1 if at least
        one file couldn't be cleaned
    :raises SystemExit: via argparse, on invalid command-line arguments
    """
    arg_parser = create_arg_parser()
    args = arg_parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    if not args.files:
        if args.list:
            show_parsers()
        elif args.check_dependencies:
            print("Dependencies for MAT2 %s:" % __version__)
            for key, value in sorted(check_dependencies().items()):
                print('- %s: %s %s' % (key, 'yes' if value['found'] else 'no',
                                       '(optional)' if not value['required'] else ''))
        else:
            arg_parser.print_help()
        return 0

    if args.show:
        for f in __get_files_recursively(args.files):
            show_meta(f, args.sandbox)
        return 0

    try:
        policy = UnknownMemberPolicy(args.unknown_members)
    except ValueError:
        # Report a bad policy like any other invalid argument (usage
        # message, exit status 2) instead of dying with a traceback.
        arg_parser.error('invalid policy %r for --unknown-members (choose from: %s)' %
                         (args.unknown_members,
                          ', '.join(p.value for p in UnknownMemberPolicy)))
    if policy == UnknownMemberPolicy.KEEP:
        logging.warning('Keeping unknown member files may leak metadata in the resulting file!')

    no_failure = True
    files = __get_files_recursively(args.files)
    # We have to use Processes instead of Threads, since
    # we're using tempfile.mkdtemp, which isn't thread-safe.
    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = [executor.submit(clean_meta, f, args.lightweight,
                                   args.inplace, args.sandbox, policy)
                   for f in files]
        # Wait for every file, even after a failure: one bad file must
        # not prevent the other ones from being cleaned.
        for future in concurrent.futures.as_completed(futures):
            if future.result() is not True:
                no_failure = False
    return 0 if no_failure else -1
225

jvoisin's avatar
jvoisin committed
226 227

# Only run the command-line interface when executed as a script; the value
# returned by main() becomes the exit status of the process.
if __name__ == '__main__':
    raise SystemExit(main())