Skip to content
Snippets Groups Projects
Commit f1a071d4 authored by Julien (jvoisin) Voisin's avatar Julien (jvoisin) Voisin
Browse files

Implement lightweight cleaning for png and tiff

parent 38df679a
Branches
Tags
No related merge requests found
Pipeline #19899 passed
......@@ -6,11 +6,12 @@ max-locals=20
disable=
fixme,
invalid-name,
duplicate-code,
missing-docstring,
protected-access,
abstract-method,
wrong-import-position,
catching-non-exception,
cell-var-from-loop,
locally-disabled,
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
abstract-method,
wrong-import-position,
catching-non-exception,
cell-var-from-loop,
locally-disabled,
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
import json
import logging
import os
import subprocess
from typing import Dict, Union, Set
......@@ -23,6 +24,34 @@ class ExiftoolParser(abstract.AbstractParser):
meta.pop(key, None)
return meta
def _lightweight_cleanup(self):
if os.path.exists(self.output_filename):
try:
# exiftool can't force output to existing files
os.remove(self.output_filename)
except OSError as e: # pragma: no cover
logging.error("The output file %s is already existing and \
can't be overwritten: %s.", self.filename, e)
return False
# Note: '-All=' must be followed by a known exiftool option.
# Also, '-CommonIFD0' is needed for .tiff files
cmd = [_get_exiftool_path(),
'-all=', # remove metadata
'-adobe=', # remove adobe-specific metadata
'-exif:all=', # remove all exif metadata
'-Time:All=', # remove all timestamps
'-quiet', # don't show useless logs
'-CommonIFD0=', # remove IFD0 metadata
'-o', self.output_filename,
self.filename]
try:
subprocess.check_call(cmd)
except subprocess.CalledProcessError as e: # pragma: no cover
logging.error("Something went wrong during the processing of %s: %s", self.filename, e)
return False
return True
def _get_exiftool_path() -> str: # pragma: no cover
exiftool_path = '/usr/bin/exiftool'
if os.path.isfile(exiftool_path):
......
......@@ -35,6 +35,8 @@ class PNGParser(exiftool.ExiftoolParser):
raise ValueError
def remove_all(self) -> bool:
if self.lightweight_cleaning:
return self._lightweight_cleanup()
surface = cairo.ImageSurface.create_from_png(self.filename)
surface.write_to_png(self.output_filename)
return True
......
......@@ -26,7 +26,7 @@ class AVIParser(exiftool.ExiftoolParser):
def remove_all(self):
cmd = [_get_ffmpeg_path(),
'-i', self.filename, # input file
'-i', self.filename, # input file
'-y', # overwrite existing output file
'-loglevel', 'panic', # Don't show log
'-hide_banner', # hide the banner
......
......@@ -194,6 +194,13 @@ class TestCorruptedFiles(unittest.TestCase):
images.JPGParser('./tests/data/clean.jpg')
os.remove('./tests/data/clean.jpg')
def test_png_lightweight(self):
return
shutil.copy('./tests/data/dirty.torrent', './tests/data/clean.png')
p = images.PNGParser('./tests/data/clean.png')
self.assertTrue(p.remove_all())
os.remove('./tests/data/clean.png')
def test_avi(self):
try:
video._get_ffmpeg_path()
......
......@@ -212,42 +212,6 @@ class TestRevisionsCleaning(unittest.TestCase):
os.remove('./tests/data/revision_clean.docx')
os.remove('./tests/data/revision_clean.cleaned.docx')
class TestLightWeightCleaning(unittest.TestCase):
def test_pdf(self):
shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
p = pdf.PDFParser('./tests/data/clean.pdf')
meta = p.get_meta()
self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
p.lightweight_cleaning = True
ret = p.remove_all()
self.assertTrue(ret)
p = pdf.PDFParser('./tests/data/clean.cleaned.pdf')
expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
self.assertEqual(p.get_meta(), expected_meta)
os.remove('./tests/data/clean.pdf')
os.remove('./tests/data/clean.cleaned.pdf')
def test_png(self):
shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
p = images.PNGParser('./tests/data/clean.png')
meta = p.get_meta()
self.assertEqual(meta['Comment'], 'This is a comment, be careful!')
p.lightweight_cleaning = True
ret = p.remove_all()
self.assertTrue(ret)
p = images.PNGParser('./tests/data/clean.cleaned.png')
self.assertEqual(p.get_meta(), {})
os.remove('./tests/data/clean.png')
os.remove('./tests/data/clean.cleaned.png')
class TestCleaning(unittest.TestCase):
def test_pdf(self):
shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
......
#!/usr/bin/env python3
import unittest
import shutil
import os
from libmat2 import pdf, images
class TestLightWeightCleaning(unittest.TestCase):
def test_pdf(self):
shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
p = pdf.PDFParser('./tests/data/clean.pdf')
meta = p.get_meta()
self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')
p.lightweight_cleaning = True
ret = p.remove_all()
self.assertTrue(ret)
p = pdf.PDFParser('./tests/data/clean.cleaned.pdf')
expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
self.assertEqual(p.get_meta(), expected_meta)
os.remove('./tests/data/clean.pdf')
os.remove('./tests/data/clean.cleaned.pdf')
def test_png(self):
shutil.copy('./tests/data/dirty.png', './tests/data/clean.png')
p = images.PNGParser('./tests/data/clean.png')
meta = p.get_meta()
self.assertEqual(meta['Comment'], 'This is a comment, be careful!')
p.lightweight_cleaning = True
ret = p.remove_all()
self.assertTrue(ret)
p = images.PNGParser('./tests/data/clean.cleaned.png')
self.assertEqual(p.get_meta(), {})
p = images.PNGParser('./tests/data/clean.png')
p.lightweight_cleaning = True
ret = p.remove_all()
self.assertTrue(ret)
os.remove('./tests/data/clean.png')
os.remove('./tests/data/clean.cleaned.png')
def test_jpg(self):
shutil.copy('./tests/data/dirty.jpg', './tests/data/clean.jpg')
p = images.JPGParser('./tests/data/clean.jpg')
meta = p.get_meta()
self.assertEqual(meta['Comment'], 'Created with GIMP')
p.lightweight_cleaning = True
ret = p.remove_all()
self.assertTrue(ret)
p = images.JPGParser('./tests/data/clean.cleaned.jpg')
self.assertEqual(p.get_meta(), {})
os.remove('./tests/data/clean.jpg')
os.remove('./tests/data/clean.cleaned.jpg')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment