Commit b02d7288 authored by jvoisin's avatar jvoisin

Test for faulty files, and document how MAT2 is behaving wrt. them

parent 459e9b82
......@@ -9,6 +9,14 @@ that only cleans the superficial metadata of your file, but not
the ones that might be in **embedded** resources, such as
images in a PDF or an office document.
Race conditions
MAT2 does its very best to avoid crashing at runtime. This is why it checks
whether the file is valid __at parser creation__. However, MAT2 doesn't take
any measures to ensure that the file is not changed between the time the
parser is instantiated and the call to clean or show the metadata.
Symlink attacks
......@@ -20,6 +20,13 @@ class PNGParser(abstract.AbstractParser):
'Compression', 'Filter', 'Interlace', 'BackgroundColor', 'ImageSize',
'Megapixels', 'ImageHeight'}
def __init__(self, filename):
try: # better fail here than later
except MemoryError:
raise ValueError
def get_meta(self):
out = subprocess.check_output(['/usr/bin/exiftool', '-json', self.filename])
meta = json.loads(out.decode('utf-8'))[0]
......@@ -30,5 +30,8 @@ def get_parser(filename: str) -> (T, str):
for c in _get_parsers():
if mtype in c.mimetypes:
return c(filename), mtype
return c(filename), mtype
except ValueError:
return None, mtype
return None, mtype
......@@ -11,7 +11,7 @@ import io
import cairo
import gi
gi.require_version('Poppler', '0.18')
from gi.repository import Poppler
from gi.repository import Poppler, GLib
from . import abstract
......@@ -28,6 +28,10 @@ class PDFParser(abstract.AbstractParser):
self.uri = 'file://' + os.path.abspath(self.filename)
self.__scale = 2 # how much precision do we want for the render
try: # Check now that the file is valid, to avoid surprises later
Poppler.Document.new_from_file(self.uri, None)
except GLib.GError: # Invalid PDF
raise ValueError
def remove_all_lightweight(self):
......@@ -116,8 +120,9 @@ class PDFParser(abstract.AbstractParser):
def get_meta(self):
""" Return a dict with all the meta of the file
document = Poppler.Document.new_from_file(self.uri, None)
metadata = {}
document = Poppler.Document.new_from_file(self.uri, None)
for key in self.meta_list:
if document.get_property(key):
metadata[key] = document.get_property(key)
......@@ -16,6 +16,18 @@ class TestParserFactory(unittest.TestCase):
self.assertEqual(mimetype, 'audio/mpeg')
self.assertEqual(parser.__class__, audio.MP3Parser)
class TestCorruptedFiles(unittest.TestCase):
def test_pdf(self):
shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
with self.assertRaises(ValueError):
def test_png(self):
shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
with self.assertRaises(ValueError):
class TestGetMeta(unittest.TestCase):
def test_pdf(self):
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment