Commit 80ece300 authored by jvoisin's avatar jvoisin

Remove hachoir from MAT.

This (huge) commit completely removes hachoir from MAT.
Audio files are now processed with mutagen, and images
with exiftool, since the main Python imaging library (PIL)
isn't great at dealing with metadata (or with damaged/non-standard
files).

Package maintainers should update their dependencies to reflect this.
parent 3cf80e8b
......@@ -16,6 +16,7 @@ addons:
- gir1.2-poppler-0.18
- python-pdfrw
- python-gi-cairo
- python-mutagen
virtualenv:
system_site_packages: true
......@@ -25,12 +26,11 @@ install:
- pip install --user --upgrade setuptools
- pip install --user coveralls
- pip install --user codecov
- pip install --user mutagen hachoir_core hachoir_parser
- popd
- python setup.py install
script:
- coverage run --source=libmat --omit='*hachoir_editor*' setup.py test
- coverage run --source=libmat setup.py test
after_success:
- coveralls
......
......@@ -27,7 +27,6 @@ See README.security
DEPENDENCIES
============
* python2.7 (at least)
* python-hachoir-core and python-hachoir-parser
* python-pdfrw, gir-poppler and python-gi-cairo for full PDF support
* python-gi for the GUI
* shred (should be already installed)
......@@ -35,7 +34,7 @@ DEPENDENCIES
OPTIONAL DEPENDENCIES
=====================
* python-mutagen: for massive audio format support
* exiftool: for _massive_ image format support
* exiftool: for image format support
USAGE
=====
......
......@@ -5,7 +5,7 @@
<mimetype>image/png</mimetype>
<support>Full</support>
<metadata>Textual metadata and date</metadata>
<method>Removal of harmful fields with hachoir.</method>
<method>Removal of harmful fields with exiftool.</method>
<remaining>None</remaining>
</format>
......@@ -15,7 +15,7 @@
<mimetype>image/jpeg</mimetype>
<support>Partial</support>
<metadata>Comments and exif/photoshop/adobe</metadata>
<method>Removal of harmful fields with hachoir.</method>
<method>Removal of harmful fields with exiftool.</method>
<remaining>Canon Raw tags</remaining>
</format>
......@@ -75,7 +75,7 @@
<mimetype>audio/mpeg</mimetype>
<support>Full</support>
<metadata>Id3</metadata>
<method>Removal of harmful fields with hachoir</method>
<method>Removal of harmful fields with mutagen</method>
<remaining>None</remaining>
</format>
......
......@@ -20,9 +20,8 @@ class GenericArchiveStripper(parser.GenericParser):
""" Represent a generic archive
"""
def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
super(GenericArchiveStripper, self).__init__(filename,
parser, mime, backup, is_writable, **kwargs)
def __init__(self, filename, mime, backup, is_writable, **kwargs):
super(GenericArchiveStripper, self).__init__(filename, mime, backup, is_writable, **kwargs)
self.compression = ''
self.add2archive = kwargs['add2archive']
self.tempdir = tempfile.mkdtemp()
......@@ -354,8 +353,8 @@ class GzipStripper(TarStripper):
""" Represent a tar.gz archive
"""
def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
super(GzipStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
def __init__(self, filename, mime, backup, is_writable, **kwargs):
super(GzipStripper, self).__init__(filename, mime, backup, is_writable, **kwargs)
self.compression = ':gz'
......@@ -363,6 +362,6 @@ class Bzip2Stripper(TarStripper):
""" Represent a tar.bz2 archive
"""
def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
super(Bzip2Stripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
def __init__(self, filename, mime, backup, is_writable, **kwargs):
super(Bzip2Stripper, self).__init__(filename, mime, backup, is_writable, **kwargs)
self.compression = ':bz2'
""" Care about audio fileformat
"""
try:
from mutagen.flac import FLAC
from mutagen.oggvorbis import OggVorbis
except ImportError:
pass
import parser
import mutagenstripper
class MpegAudioStripper(parser.GenericParser):
    """ Stripper for mpeg audio files (mp3, ...)
    """
    def _should_remove(self, field):
        """ Return True when `field` is named "id3v1" or "id3v2",
            False for any other field name.
        """
        field_name = field.name
        return field_name == "id3v1" or field_name == "id3v2"
class OggStripper(mutagenstripper.MutagenStripper):
    """ Stripper for ogg vorbis files
    """
    def _create_mfile(self):
        """ Open self.filename with mutagen's OggVorbis and keep the
            resulting object in self.mfile.
        """
        ogg_file = OggVorbis(self.filename)
        self.mfile = ogg_file
class FlacStripper(mutagenstripper.MutagenStripper):
    """ Stripper for Flac audio files; on top of what the parent class
        handles, it also takes care of the embedded pictures.
    """
    def _create_mfile(self):
        """ Open self.filename with mutagen's FLAC and keep the
            resulting object in self.mfile.
        """
        flac_file = FLAC(self.filename)
        self.mfile = flac_file

    def remove_all(self):
        """ Run the parent's removal, then clear the embedded pictures
            and save the file. Always returns True.
        """
        super(FlacStripper, self).remove_all()
        flac_file = self.mfile
        flac_file.clear_pictures()
        flac_file.save()
        return True

    def is_clean(self):
        """ The file is clean when the parent reports it clean and
            no picture is embedded in it.
        """
        parent_verdict = super(FlacStripper, self).is_clean()
        if not parent_verdict:
            # Same short-circuit as `and`: hand back the parent's falsy value
            return parent_verdict
        return not self.mfile.pictures

    def get_meta(self):
        """ Return the parent's metadata dict, with an extra 'picture:'
            entry when the file embeds at least one picture.
        """
        found = super(FlacStripper, self).get_meta()
        if not self.mfile.pictures:
            return found
        found['picture:'] = 'yes'
        return found
......@@ -9,8 +9,8 @@ class ExiftoolStripper(parser.GenericParser):
""" A generic stripper class using exiftool as backend
"""
def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
super(ExiftoolStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
def __init__(self, filename, mime, backup, is_writable, **kwargs):
super(ExiftoolStripper, self).__init__(filename, mime, backup, is_writable, **kwargs)
self.allowed = {'ExifTool Version Number', 'File Name', 'Directory', 'File Size', 'File Modification Date/Time',
'File Access Date/Time', 'File Permissions', 'File Type', 'File Type Extension', 'MIME Type',
'Image Width', 'Image Height', 'Image Size', 'File Inode Change Date/Time', 'Megapixels'}
......
from field import (
EditorError, FakeField)
from typed_field import (
EditableField, EditableBits, EditableBytes,
EditableInteger, EditableString,
createEditableField)
from fieldset import EditableFieldSet, NewFieldSet, createEditor
from hachoir_core.error import HachoirError
from hachoir_core.field import joinPath, MissingField
class EditorError(HachoirError):
    """HachoirError subclass that adds no behaviour of its own."""
class FakeField(object):
    """
    Field-like object that stores nothing but its parent and its name.

    Address, size, value and display are not kept here: each access asks
    the parent (or the parent's input) for the entry called `name`.
    """
    is_field_set = False

    def __init__(self, parent, name):
        self._name = name
        self._parent = parent

    def _getPath(self):
        parent_path = self._parent.path
        return joinPath(parent_path, self._name)
    path = property(_getPath)

    def _getName(self):
        return self._name
    name = property(_getName)

    def _getAddress(self):
        owner = self._parent
        return owner._getFieldAddress(self._name)
    address = property(_getAddress)

    def _getSize(self):
        original = self._parent.input[self._name]
        return original.size
    size = property(_getSize)

    def _getValue(self):
        original = self._parent.input[self._name]
        return original.value
    value = property(_getValue)

    def createDisplay(self):
        # TODO: Returns new value if field is altered
        original = self._parent.input[self._name]
        return original.display
    display = property(createDisplay)

    def _getParent(self):
        return self._parent
    parent = property(_getParent)

    def hasValue(self):
        original = self._parent.input[self._name]
        return original.hasValue()

    def __getitem__(self, key):
        # TODO: Implement this function!
        raise MissingField(self, key)

    def _isAltered(self):
        return False
    is_altered = property(_isAltered)

    def writeInto(self, output):
        """
        Copy this field from the parent's input stream into `output`:
        a byte copy when the size is a multiple of 8 bits, a bit copy
        otherwise.
        """
        bit_count = self.size
        start = self._parent._getFieldInputAddress(self._name)
        source = self._parent.input
        source_stream = source.stream
        if bit_count % 8 == 0:
            output.copyBytesFrom(source_stream, start, bit_count // 8)
        else:
            output.copyBitsFrom(source_stream, start, bit_count, source.endian)
This diff is collapsed.
from hachoir_core.field import (
RawBits, Bit, Bits, PaddingBits,
RawBytes, Bytes, PaddingBytes,
GenericString, Character,
isInteger, isString)
from field import FakeField
class EditableField(FakeField):
    """
    Abstract base of the editable field classes.

    Subclasses provide _computeSize() and _write(). While no value has
    been stored (self._value is None), value and size fall back to the
    FakeField lookups.
    """
    _is_altered = False

    def __init__(self, parent, name, value=None):
        FakeField.__init__(self, parent, name)
        self._value = value

    def _isAltered(self):
        return self._is_altered
    is_altered = property(_isAltered)

    def hasValue(self):
        return True

    def _computeSize(self):
        raise NotImplementedError()

    def _getValue(self):
        return self._value

    def _setValue(self, value):
        self._value = value

    def _propGetValue(self):
        if self._value is None:
            return FakeField._getValue(self)
        return self._getValue()

    def _propSetValue(self, value):
        # Store first (subclasses may reject the value), then flag as altered
        self._setValue(value)
        self._is_altered = True
    value = property(_propGetValue, _propSetValue)

    def _getSize(self):
        if self._value is None:
            return FakeField._getSize(self)
        return self._computeSize()
    size = property(_getSize)

    def _write(self, output):
        raise NotImplementedError()

    def writeInto(self, output):
        """
        Write the field into `output`: through _write() when the field
        was altered, through FakeField.writeInto() otherwise.
        """
        if not self._is_altered:
            return FakeField.writeInto(self, output)
        self._write(output)
class EditableFixedField(EditableField):
    """
    Editable field whose size is set once, at construction.
    """

    def __init__(self, parent, name, value=None, size=None):
        EditableField.__init__(self, parent, name, value)
        if size is None:
            # No explicit size: reuse the size of the original field
            size = self._parent._getOriginalField(self._name).size
        self._size = size

    def _getSize(self):
        return self._size
    size = property(_getSize)
class EditableBits(EditableFixedField):
    """
    Editable bit field: a non-negative integer stored on self._size bits.
    """

    def __init__(self, parent, name, *args):
        size = value = None
        if args:
            if len(args) != 2:
                raise TypeError(
                    "Wrong argument count, EditableBits constructor prototype is: "
                    "(parent, name, [size, value])")
            size, value = args
            assert isinstance(value, (int, long))
        EditableFixedField.__init__(self, parent, name, value, size)
        if args:
            # Range-check the explicit value now that the size is known
            self._setValue(args[1])
            self._is_altered = True

    def _setValue(self, value):
        upper = 1 << self._size
        if 0 <= value < upper:
            self._value = value
            return
        raise ValueError("Invalid value, must be in range %s..%s"
                         % (0, upper - 1))

    def _write(self, output):
        output.writeBits(self._size, self._value, self._parent.endian)
class EditableBytes(EditableField):
    """
    Editable byte string; an empty value is refused.
    """

    def _setValue(self, value):
        if value:
            self._value = value
            return
        raise ValueError(
            "Unable to set empty string to a EditableBytes field")

    def _computeSize(self):
        # Size is expressed in bits
        return 8 * len(self._value)

    def _write(self, output):
        output.writeBytes(self._value)
class EditableString(EditableField):
    """
    Editable string field.

    Built either from an explicit (format, value) pair, or with no extra
    argument, in which case charset, format, prefix size and suffix are
    copied from the original field.
    """
    # Longest value accepted for each length-prefixed format
    MAX_SIZE = {
        "Pascal8": (1 << 8) - 1,
        "Pascal16": (1 << 16) - 1,
        "Pascal32": (1 << 32) - 1,
    }

    def __init__(self, parent, name, *args, **kw):
        explicit = (len(args) == 2)
        if explicit:
            value = args[1]
            assert isinstance(value, str)  # TODO: support Unicode
        elif args:
            raise TypeError(
                "Wrong argument count, EditableString constructor prototype is:"
                "(parent, name, [format, value])")
        else:
            value = None
        EditableField.__init__(self, parent, name, value)

        if not explicit:
            original = self._parent._getOriginalField(name)
            self._charset = original.charset
            self._format = original.format
            self._prefix_size = original.content_offset
            self._suffix_str = original.suffix_str
            return

        self._charset = kw.get('charset', None)
        self._format = args[0]
        pascal_formats = GenericString.PASCAL_FORMATS
        self._prefix_size = (
            pascal_formats[self._format]
            if self._format in pascal_formats else 0)
        self._suffix_str = GenericString.staticSuffixStr(
            self._format, self._charset, self._parent.endian)
        self._is_altered = True

    def _setValue(self, value):
        length = len(value)
        limit = self.MAX_SIZE.get(self._format)
        if limit is not None and limit < length:
            raise ValueError("String is too big")
        self._value = value

    def _computeSize(self):
        byte_count = (
            self._prefix_size + len(self._value) + len(self._suffix_str))
        return byte_count * 8

    def _write(self, output):
        payload = self._value
        if self._format in GenericString.SUFFIX_FORMAT:
            output.writeBytes(payload)
            output.writeBytes(self._suffix_str)
            return
        if self._format == "fixed":
            output.writeBytes(payload)
            return
        # Remaining case: length-prefixed string
        assert self._format in GenericString.PASCAL_FORMATS
        prefix_bytes = GenericString.PASCAL_FORMATS[self._format]
        output.writeInteger(len(payload), False, prefix_bytes, self._parent.endian)
        output.writeBytes(payload)
class EditableCharacter(EditableFixedField):
    """
    Editable single character, stored on 8 bits.

    Constructor prototype: (parent, name, [value]). When `value` is
    given it must be a `str` of length 1 and the field is flagged as
    altered.
    """

    def __init__(self, parent, name, *args):
        """
        Raises TypeError when more than one extra argument is passed, or
        when the value is not a one-character `str`.
        """
        if args:
            # The prototype takes a single optional value; the previous
            # check compared against 3, which rejected every valid call.
            if len(args) != 1:
                raise TypeError(
                    "Wrong argument count, EditableCharacter "
                    "constructor prototype is: (parent, name, [value])")
            value = args[0]
            if not isinstance(value, str) or len(value) != 1:
                raise TypeError("EditableCharacter needs a character")
        else:
            value = None
        EditableFixedField.__init__(self, parent, name, value, 8)
        if args:
            self._is_altered = True

    def _setValue(self, value):
        """Store `value`; raises TypeError unless it is a one-character `str`."""
        if not isinstance(value, str) or len(value) != 1:
            raise TypeError("EditableCharacter needs a character")
        self._value = value

    def _write(self, output):
        output.writeBytes(self._value)
class EditableInteger(EditableFixedField):
    """
    Editable integer field, signed or unsigned, with a fixed size in bits.

    Constructor prototype: (parent, name, [signed, size, value]). Without
    the optional arguments, size and signedness are read from the
    original field.
    """
    # Inclusive (min, max) bounds, indexed by the field size in bits.
    # The signed 8-bit entry used to be -256..255; a signed byte is -128..127.
    VALID_VALUE_SIGNED = {
        8: (-(1 << 7), (1 << 7) - 1),
        16: (-(1 << 15), (1 << 15) - 1),
        32: (-(1 << 31), (1 << 31) - 1),
    }
    VALID_VALUE_UNSIGNED = {
        8: (0, (1 << 8) - 1),
        16: (0, (1 << 16) - 1),
        32: (0, (1 << 32) - 1)
    }

    def __init__(self, parent, name, *args):
        """Raises TypeError when the optional arguments are not exactly three."""
        if args:
            if len(args) != 3:
                raise TypeError(
                    "Wrong argument count, EditableInteger constructor prototype is: "
                    "(parent, name, [signed, size, value])")
            size = args[1]
            value = args[2]
            assert isinstance(value, (int, long))
        else:
            size = None
            value = None
        EditableFixedField.__init__(self, parent, name, value, size)
        if args:
            self._signed = args[0]
            self._is_altered = True
        else:
            self._signed = self._parent._getOriginalField(self._name).signed

    def _setValue(self, value):
        """
        Store `value` after checking it fits the field.

        Raises ValueError when `value` is outside the range allowed by
        the field size and signedness.
        """
        if self._signed:
            valid = self.VALID_VALUE_SIGNED
        else:
            valid = self.VALID_VALUE_UNSIGNED
        try:
            minval, maxval = valid[self._size]
        except KeyError:
            # Size missing from the tables: derive the bounds from the bit
            # count instead of failing with an unrelated KeyError.
            if self._signed:
                half = 1 << (self._size - 1)
                minval, maxval = -half, half - 1
            else:
                minval, maxval = 0, (1 << self._size) - 1
        if not (minval <= value <= maxval):
            raise ValueError("Invalid value, must be in range %s..%s"
                             % (minval, maxval))
        self._value = value

    def _write(self, output):
        # writeInteger takes the size in bytes, self._size is in bits
        output.writeInteger(
            self.value, self._signed, self._size // 8, self._parent.endian)
def createEditableField(fieldset, field):
    """
    Build the editable counterpart of `field` inside `fieldset`.

    Integers and strings are recognised with isInteger()/isString(); the
    byte, bit and character classes are matched on the exact class of
    `field`. Anything else is wrapped in a plain FakeField.
    """
    exact_matches = (
        ((RawBytes, Bytes, PaddingBytes), EditableBytes),
        ((RawBits, Bits, Bit, PaddingBits), EditableBits),
        ((Character,), EditableCharacter),
    )
    if isInteger(field):
        editable_class = EditableInteger
    elif isString(field):
        editable_class = EditableString
    else:
        editable_class = FakeField
        for field_classes, candidate in exact_matches:
            if field.__class__ in field_classes:
                editable_class = candidate
                break
    return editable_class(fieldset, field.name)
""" Takes care about pictures formats
References:
- JFIF: http://www.ecma-international.org/publications/techreports/E-TR-098.htm
- PNG: http://www.sno.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html
- PNG: http://www.w3.org/TR/PNG-Chunks.html
"""
import parser
class JpegStripper(parser.GenericParser):
    """ Represents a jpeg file.
        Only the structural fields listed below are kept, along with
        the custom Huffman and Quantization tables; every other
        field is considered compromising.
    """
    # Fields that are never removed
    _KEPT_FIELDS = frozenset([
        'start_image',  # start of the image
        'app0',  # JFIF data
        'start_frame',  # specify width, height, number of components
        'start_scan',  # specify which slice of data the top-to-bottom scan contains
        'data',  # actual data
        'end_image'])  # end of the image
    # Indexed fields that are kept too:
    # custom Quant. tables and custom Huffman tables
    _KEPT_PREFIXES = ('quantization[', 'huffman[')

    def _should_remove(self, field):
        """ Return True if the field is compromising,
            False if it is one of the kept fields.
        """
        field_name = field.name
        if field_name in self._KEPT_FIELDS:
            return False
        return not field_name.startswith(self._KEPT_PREFIXES)
class PngStripper(parser.GenericParser):
    """ Represents a png file
    """
    def _should_remove(self, field):
        """ Return True if the field is compromising, i.e. if it is
            neither one of the kept fields nor a data[...] field.
        """
        kept_fields = frozenset([
            'id',
            'header',  # PNG header
            'physical',  # the intended pixel size or aspect ratio
            'end'])  # end of the image
        is_kept = field.name in kept_fields
        if not is_kept:
            is_kept = field.name.startswith('data[')  # data
        return not is_kept
......@@ -10,9 +10,6 @@ import platform
import subprocess
import xml.sax
import hachoir_core.cmd_line
import hachoir_parser
import libmat.exceptions
__version__ = '0.5.4'
......@@ -20,12 +17,10 @@ __author__ = 'jvoisin'
# Silence
LOGGING_LEVEL = logging.CRITICAL
hachoir_core.config.quiet = True
fname = ''
# Verbose
# LOGGING_LEVEL = logging.DEBUG
# hachoir_core.config.quiet = False
# logname = 'report.log'
logging.basicConfig(filename=fname, level=LOGGING_LEVEL)
......@@ -155,22 +150,10 @@ def create_class_file(name, backup, **kwargs):
elif not os.access(name, os.R_OK): # check read permissions
logging.error('%s is is not readable', name)
return None
elif not os.path.getsize(name): # check if the file is not empty (hachoir crash on empty files)
logging.error('%s is empty', name)
return None
try:
filename = hachoir_core.cmd_line.unicodeFilename(name)
except TypeError: # get rid of "decoding Unicode is not supported"
filename = name
parser = hachoir_parser.createParser(filename)
if not parser:
logging.info('Unable to parse %s with hachoir', filename)
mime = mimetypes.guess_type(name)[0]
if not mime:
logging.info('Unable to find mimetype of %s', filename)
logging.info('Unable to find mimetype of %s', name)
return None
if mime.startswith('application/vnd.oasis.opendocument'):
......@@ -186,4 +169,4 @@ def create_class_file(name, backup, **kwargs):
logging.info('Don\'t have stripper for %s format', mime)
return None
return stripper_class(filename, parser, mime, backup, is_writable, **kwargs)
return stripper_class(name, mime, backup, is_writable, **kwargs)
......@@ -11,8 +11,8 @@ class TorrentStripper(parser.GenericParser):
of the bencode lib from Petru Paler
"""
def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
super(TorrentStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
def __init__(self, filename, mime, backup, is_writable, **kwargs):
super(TorrentStripper, self).__init__(filename, mime, backup, is_writable, **kwargs)
self.fields = frozenset(['announce', 'info', 'name', 'path', 'piece length', 'pieces',
'length', 'files', 'announce-list', 'nodes', 'httpseeds', 'private', 'root hash'])
......
......@@ -3,11 +3,15 @@
import parser
from mutagen.flac import FLAC
from mutagen.oggvorbis import OggVorbis
from mutagen.mp3 import MP3
class MutagenStripper(parser.GenericParser):
""" Parser using the (awesome) mutagen library. """
def __init__(self, filename, parser, mime, backup, is_writable, **kwargs):
super(MutagenStripper, self).__init__(filename, parser, mime, backup, is_writable, **kwargs)
def __init__(self, filename, mime, backup, is_writable, **kwargs):
super(MutagenStripper, self).__init__(filename, mime, backup, is_writable, **kwargs)
self.mfile = None # This will be instanciated in self._create_mfile()
self._create_mfile()
......@@ -36,3 +40,61 @@ class MutagenStripper(parser.GenericParser):
for key, value in self.mfile.tags:
metadata[key] = value
return metadata
class MpegAudioStripper(MutagenStripper):
    """ Represent a mp3 audio file
    """
    def _create_mfile(self):
        """ Open self.filename with mutagen's MP3 and keep the
            resulting object in self.mfile.
        """
        self.mfile = MP3(self.filename)

    def get_meta(self):
        """
            Return the content of the metadata block if present,
            as a dict; the dict is empty when the file has no tags.
        """
        metadata = {}
        tags = self.mfile.tags
        if not tags:
            return metadata
        for key in tags.keys():
            meta = tags[key]
            # Sometimes, the field has a human-readable description
            desc = getattr(meta, 'desc', key)
            text = getattr(meta, 'text', None)
            if isinstance(text, (list, tuple)):
                # List of strings: report the first one, if any
                value = text[0] if text else ''
            elif text is not None:
                # Plain string: indexing it would only keep one character
                value = text
            else:
                # NOTE(review): some tag entries apparently carry no `text`
                # at all (embedded pictures?) - confirm against the mutagen
                # docs. They are still reported, so that such metadata is
                # never silently hidden from the caller.
                value = '<%s>' % type(meta).__name__
            metadata[desc] = value
        return metadata
class OggStripper(MutagenStripper):
""" Represent an ogg vorbis file