Commit e70ea811 authored by jvoisin's avatar jvoisin

Implement support for .avi files, via ffmpeg

- This commit introduces optional dependencies (namely ffmpeg):
  mat2 will spit a warning when trying to process an .avi file
  if ffmpeg isn't installed.
- Since metadata is obtained via exiftool, this commit
  also refactors our exiftool wrapper a bit.
parent 2ae5d909
Pipeline #19861 passed with stages
in 5 minutes and 31 seconds
......@@ -42,7 +42,7 @@ tests:debian:
stage: test
script:
- apt-get -qqy update
- apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage
- apt-get -qqy install --no-install-recommends python3-mutagen python3-gi-cairo gir1.2-poppler-0.18 gir1.2-gdkpixbuf-2.0 libimage-exiftool-perl python3-coverage ffmpeg
- python3-coverage run --branch -m unittest discover -s tests/
- python3-coverage report --fail-under=100 -m --include 'libmat2/*'
......@@ -62,5 +62,5 @@ tests:archlinux:
tags:
- whitewhale
script:
- pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap
- pacman -Sy --noconfirm python-mutagen python-gobject gdk-pixbuf2 poppler-glib gdk-pixbuf2 python-cairo perl-image-exiftool python-setuptools mailcap ffmpeg
- python3 setup.py test
#!/usr/bin/env python3
import os
import collections
import enum
import importlib
from typing import Dict, Optional
from . import exiftool, video
# make pyflakes happy
assert Dict
assert Optional
......@@ -37,24 +38,13 @@ DEPENDENCIES = {
'mutagen': 'Mutagen',
}
def _get_exiftool_path() -> str: # pragma: no cover
exiftool_path = '/usr/bin/exiftool'
if os.path.isfile(exiftool_path):
if os.access(exiftool_path, os.X_OK):
return exiftool_path
# ArchLinux
exiftool_path = '/usr/bin/vendor_perl/exiftool'
if os.path.isfile(exiftool_path):
if os.access(exiftool_path, os.X_OK):
return exiftool_path
raise ValueError
def check_dependencies() -> dict:
ret = collections.defaultdict(bool) # type: Dict[str, bool]
ret['Exiftool'] = True if _get_exiftool_path() else False
ret['Exiftool'] = True if exiftool._get_exiftool_path() else False
ret['Ffmpeg'] = True if video._get_ffmpeg_path() else False
for key, value in DEPENDENCIES.items():
ret[value] = True
......
......@@ -7,7 +7,8 @@ assert Set # make pyflakes happy
class AbstractParser(abc.ABC):
""" This is the base class of every parser.
It might yield `ValueError` on instantiation on invalid files.
It might yield `ValueError` on instantiation on invalid files,
and `RuntimeError` when something went wrong in `remove_all`.
"""
meta_list = set() # type: Set[str]
mimetypes = set() # type: Set[str]
......@@ -27,4 +28,7 @@ class AbstractParser(abc.ABC):
@abc.abstractmethod
def remove_all(self) -> bool:
"""
:raises RuntimeError: Raised if the cleaning process went wrong.
"""
pass # pragma: no cover
import json
import os
import re
import shutil
import subprocess
import tempfile
from typing import Dict, Union, Set
from . import abstract
# Make pyflakes happy
assert Set
class ExiftoolParser(abstract.AbstractParser):
    """ Exiftool is often the easiest way to get all the metadata
    from a file, hence why several parsers are re-using its `get_meta`
    method.
    """
    # Keys that `get_meta` removes from exiftool's output before returning it.
    meta_whitelist = set()  # type: Set[str]

    @staticmethod
    def __handle_problematic_filename(filename: str, callback) -> bytes:
        """ This method takes a filename with a problematic name,
        and safely applies a `callback` to it.

        The file is copied under a fixed name into a fresh temporary
        directory, and the callback is given the path of that copy.

        :param filename: Path of the file with the problematic name.
        :param callback: Callable taking a file path.
        :returns: Whatever `callback` returned.
        """
        tmpdirname = tempfile.mkdtemp()
        try:
            fname = os.path.join(tmpdirname, "temp_file")
            shutil.copy(filename, fname)
            return callback(fname)
        finally:
            # Always remove the copy, even if copying or the callback failed.
            shutil.rmtree(tmpdirname)

    def get_meta(self) -> Dict[str, Union[str, dict]]:
        """ Return the metadata that `exiftool -json` reports for
        `self.filename`, minus the keys listed in `meta_whitelist`.

        There is no way to escape the leading(s) dash(es) of the current
        self.filename to prevent parameter injections, so we need to take care
        of this.
        """
        fun = lambda f: subprocess.check_output([_get_exiftool_path(), '-json', f])
        # Anything not starting with a lowercase letter, a digit or a slash
        # is handed to exiftool through a copy with a neutral name.
        if re.search('^[a-z0-9/]', self.filename) is None:
            out = self.__handle_problematic_filename(self.filename, fun)
        else:
            out = fun(self.filename)
        # Only the first element of the JSON array is used.
        meta = json.loads(out.decode('utf-8'))[0]
        for key in self.meta_whitelist:
            meta.pop(key, None)
        return meta
def _get_exiftool_path() -> str: # pragma: no cover
exiftool_path = '/usr/bin/exiftool'
if os.path.isfile(exiftool_path):
if os.access(exiftool_path, os.X_OK):
return exiftool_path
# ArchLinux
exiftool_path = '/usr/bin/vendor_perl/exiftool'
if os.path.isfile(exiftool_path):
if os.access(exiftool_path, os.X_OK):
return exiftool_path
raise RuntimeError("Unable to find exiftool")
import subprocess
import imghdr
import json
import os
import shutil
import tempfile
import re
from typing import Set, Dict, Union
from typing import Set
import cairo
......@@ -13,44 +8,12 @@ import gi
gi.require_version('GdkPixbuf', '2.0')
from gi.repository import GdkPixbuf
from . import abstract, _get_exiftool_path
from . import exiftool
# Make pyflakes happy
assert Set
class _ImageParser(abstract.AbstractParser):
    """ Since we use `exiftool` to get metadata from
    all image file formats, `get_meta` is implemented in this class,
    and all the image-handling ones are inheriting from it."""
    # Keys that `get_meta` removes from exiftool's output before returning it.
    meta_whitelist = set()  # type: Set[str]

    @staticmethod
    def __handle_problematic_filename(filename: str, callback) -> bytes:
        """ This method takes a filename with a problematic name,
        and safely applies a `callback` to it.

        The file is copied under a fixed name into a fresh temporary
        directory, and the callback is given the path of that copy.

        :param filename: Path of the file with the problematic name.
        :param callback: Callable taking a file path.
        :returns: Whatever `callback` returned.
        """
        tmpdirname = tempfile.mkdtemp()
        try:
            fname = os.path.join(tmpdirname, "temp_file")
            shutil.copy(filename, fname)
            return callback(fname)
        finally:
            # Always remove the copy, even if copying or the callback failed.
            shutil.rmtree(tmpdirname)

    def get_meta(self) -> Dict[str, Union[str, dict]]:
        """ Return the metadata that `exiftool -json` reports for
        `self.filename`, minus the keys listed in `meta_whitelist`.

        There is no way to escape the leading(s) dash(es) of the current
        self.filename to prevent parameter injections, so we need to take care
        of this.
        """
        fun = lambda f: subprocess.check_output([_get_exiftool_path(), '-json', f])
        # Anything not starting with a lowercase letter, a digit or a slash
        # is handed to exiftool through a copy with a neutral name.
        if re.search('^[a-z0-9/]', self.filename) is None:
            out = self.__handle_problematic_filename(self.filename, fun)
        else:
            out = fun(self.filename)
        # Only the first element of the JSON array is used.
        meta = json.loads(out.decode('utf-8'))[0]
        for key in self.meta_whitelist:
            meta.pop(key, None)
        return meta
class PNGParser(_ImageParser):
class PNGParser(exiftool.ExiftoolParser):
mimetypes = {'image/png', }
meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName',
'Directory', 'FileSize', 'FileModifyDate',
......@@ -77,7 +40,7 @@ class PNGParser(_ImageParser):
return True
class GdkPixbufAbstractParser(_ImageParser):
class GdkPixbufAbstractParser(exiftool.ExiftoolParser):
""" GdkPixbuf can handle a lot of surfaces, so we're rending images on it,
this has the side-effect of completely removing metadata.
"""
......
......@@ -18,6 +18,8 @@ def __load_all_parsers():
continue
elif fname.endswith('__init__.py'):
continue
elif fname.endswith('exiftool.py'):
continue
basename = os.path.basename(fname)
name, _ = os.path.splitext(basename)
importlib.import_module('.' + name, package='libmat2')
......
import os
import subprocess
from . import exiftool
class AVIParser(exiftool.ExiftoolParser):
    """ Parser for `.avi` files: metadata are read through the inherited
    exiftool-based `get_meta`, and removed by having ffmpeg copy the streams
    into a new file.
    """
    mimetypes = {'video/x-msvideo', }
    # Fields reported by exiftool that are left out of `get_meta`'s result.
    meta_whitelist = {'SourceFile', 'ExifToolVersion', 'FileName', 'Directory',
                      'FileSize', 'FileModifyDate', 'FileAccessDate',
                      'FileInodeChangeDate', 'FilePermissions', 'FileType',
                      'FileTypeExtension', 'MIMEType', 'FrameRate', 'MaxDataRate',
                      'FrameCount', 'StreamCount', 'StreamType', 'VideoCodec',
                      'VideoFrameRate', 'VideoFrameCount', 'Quality',
                      'SampleSize', 'BMPVersion', 'ImageWidth', 'ImageHeight',
                      'Planes', 'BitDepth', 'Compression', 'ImageLength',
                      'PixelsPerMeterX', 'PixelsPerMeterY', 'NumColors',
                      'NumImportantColors',
                      'RedMask', 'GreenMask', 'BlueMask', 'AlphaMask',
                      'ColorSpace', 'AudioCodec', 'AudioCodecRate',
                      'AudioSampleCount',
                      'AudioSampleRate', 'Encoding', 'NumChannels',
                      'SampleRate', 'AvgBytesPerSec', 'BitsPerSample',
                      'Duration', 'ImageSize', 'Megapixels'}

    def remove_all(self) -> bool:
        """ Write a copy of `self.filename` to `self.output_filename` with
        ffmpeg, using the metadata-stripping options listed below.

        :returns: True on success, False if ffmpeg exited with an error.
        :raises RuntimeError: Raised if ffmpeg can't be found.
        """
        # Absolute paths never start with a dash, so ffmpeg can't mistake
        # a filename such as `-foo.avi` for a command-line option.
        input_path = os.path.abspath(self.filename)
        output_path = os.path.abspath(self.output_filename)

        cmd = [_get_ffmpeg_path(),
               '-i', input_path,         # input file
               '-y',                     # overwrite existing output file
               '-loglevel', 'panic',     # Don't show log
               '-hide_banner',           # hide the banner
               '-codec', 'copy',         # don't decode anything, just copy (speed!)
               '-map_metadata', '-1',    # remove superficial metadata
               '-map_chapters', '-1',    # remove chapters
               '-fflags', '+bitexact',   # don't add any metadata
               '-flags:v', '+bitexact',  # don't add any metadata
               '-flags:a', '+bitexact',  # don't add any metadata
               output_path]

        try:
            subprocess.check_call(cmd)
        except subprocess.CalledProcessError:  # pragma: no cover
            return False
        return True
def _get_ffmpeg_path() -> str: # pragma: no cover
ffmpeg_path = '/usr/bin/ffmpeg'
if os.path.isfile(ffmpeg_path):
if os.access(ffmpeg_path, os.X_OK):
return ffmpeg_path
raise RuntimeError("Unable to find ffmpeg")
......@@ -97,7 +97,13 @@ def clean_meta(filename: str, is_lightweight: bool, policy: UnknownMemberPolicy)
return False
p.unknown_member_policy = policy
p.lightweight_cleaning = is_lightweight
return p.remove_all()
try:
return p.remove_all()
except RuntimeError as e:
print("[-] %s can't be cleaned: %s" % (filename, e))
return False
def show_parsers() -> bool:
......
......@@ -6,12 +6,16 @@ import os
import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
from libmat2 import check_dependencies
from libmat2 import check_dependencies, video
class TestCheckDependencies(unittest.TestCase):
    """ Check that every dependency is reported as available. """

    def test_deps(self):
        # Single, guarded call: a missing dependency surfaces as RuntimeError.
        try:
            ret = check_dependencies()
        except RuntimeError as e:
            # this happens if not every dependency is installed;
            # report a skip instead of a silent pass
            self.skipTest("not every dependency is installed: %s" % e)

        for value in ret.values():
            self.assertTrue(value)
......@@ -471,3 +475,24 @@ class TestCleaning(unittest.TestCase):
os.remove('./tests/data/clean.txt')
os.remove('./tests/data/clean.cleaned.txt')
os.remove('./tests/data/clean.cleaned.cleaned.txt')
def test_avi(self):
    """ Clean an .avi file, check that the cleaned file has no metadata
    left, and that it can itself be cleaned again. """
    shutil.copy('./tests/data/dirty.avi', './tests/data/clean.avi')
    p = video.AVIParser('./tests/data/clean.avi')

    meta = p.get_meta()
    self.assertEqual(meta['Software'], 'MEncoder SVN-r33148-4.0.1')

    try:
        ret = p.remove_all()
    except RuntimeError:
        # this happens if ffmpeg is not installed: don't leave the working
        # copy behind, and report a skip instead of a silent pass
        os.remove('./tests/data/clean.avi')
        self.skipTest('ffmpeg is not installed')
    self.assertTrue(ret)

    p = video.AVIParser('./tests/data/clean.cleaned.avi')
    self.assertEqual(p.get_meta(), {})
    self.assertTrue(p.remove_all())

    os.remove('./tests/data/clean.avi')
    os.remove('./tests/data/clean.cleaned.avi')
    os.remove('./tests/data/clean.cleaned.cleaned.avi')
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment