Skip to content
Snippets Groups Projects
Commit f1a071d4 authored by Julien (jvoisin) Voisin's avatar Julien (jvoisin) Voisin
Browse files

Implement lightweight cleaning for png and tiff

parent 38df679a
No related branches found
No related tags found
No related merge requests found
...@@ -6,11 +6,12 @@ max-locals=20 ...@@ -6,11 +6,12 @@ max-locals=20
disable= disable=
fixme, fixme,
invalid-name, invalid-name,
duplicate-code,
missing-docstring, missing-docstring,
protected-access, protected-access,
abstract-method, abstract-method,
wrong-import-position, wrong-import-position,
catching-non-exception, catching-non-exception,
cell-var-from-loop, cell-var-from-loop,
locally-disabled, locally-disabled,
invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation invalid-sequence-index, # pylint doesn't like things like `Tuple[int, bytes]` in type annotation
import json import json
import logging
import os import os
import subprocess import subprocess
from typing import Dict, Union, Set from typing import Dict, Union, Set
...@@ -23,6 +24,34 @@ class ExiftoolParser(abstract.AbstractParser): ...@@ -23,6 +24,34 @@ class ExiftoolParser(abstract.AbstractParser):
meta.pop(key, None) meta.pop(key, None)
return meta return meta
def _lightweight_cleanup(self):
    """Strip metadata from `self.filename` with exiftool.

    The result is written to `self.output_filename`; any file already
    present at that path is removed first.

    Returns True when exiftool exits successfully, and False when the
    stale output file can't be removed or exiftool reports a failure.
    """
    try:
        # exiftool can't force output to existing files
        os.remove(self.output_filename)
    except FileNotFoundError:
        pass  # no previous output to get rid of
    except OSError as e:  # pragma: no cover
        # Report the path that actually failed (the output file), and use
        # implicit string concatenation so that no source indentation
        # leaks into the logged message.
        logging.error("The output file %s is already existing and "
                      "can't be overwritten: %s.", self.output_filename, e)
        return False

    # Note: '-All=' must be followed by a known exiftool option.
    # Also, '-CommonIFD0' is needed for .tiff files
    # NOTE(review): a filename starting with '-' would presumably be read
    # by exiftool as an option -- confirm how callers sanitise paths.
    cmd = [_get_exiftool_path(),
           '-all=',         # remove metadata
           '-adobe=',       # remove adobe-specific metadata
           '-exif:all=',    # remove all exif metadata
           '-Time:All=',    # remove all timestamps
           '-quiet',        # don't show useless logs
           '-CommonIFD0=',  # remove IFD0 metadata
           '-o', self.output_filename,
           self.filename]
    try:
        # check_call raises CalledProcessError on a non-zero exit status.
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:  # pragma: no cover
        logging.error("Something went wrong during the processing of %s: %s", self.filename, e)
        return False
    return True
def _get_exiftool_path() -> str: # pragma: no cover def _get_exiftool_path() -> str: # pragma: no cover
exiftool_path = '/usr/bin/exiftool' exiftool_path = '/usr/bin/exiftool'
if os.path.isfile(exiftool_path): if os.path.isfile(exiftool_path):
......
...@@ -35,6 +35,8 @@ class PNGParser(exiftool.ExiftoolParser): ...@@ -35,6 +35,8 @@ class PNGParser(exiftool.ExiftoolParser):
raise ValueError raise ValueError
def remove_all(self) -> bool:
    """Write a cleaned copy of `self.filename` to `self.output_filename`.

    With `lightweight_cleaning` enabled the work is delegated to
    `_lightweight_cleanup` and its result is returned. Otherwise the PNG is
    loaded into a cairo surface which is then written back out as a new
    PNG, and True is returned.
    """
    if not self.lightweight_cleaning:
        # Round-trip through cairo: only the decoded surface is written out.
        cairo.ImageSurface.create_from_png(self.filename).write_to_png(self.output_filename)
        return True
    return self._lightweight_cleanup()
......
...@@ -26,7 +26,7 @@ class AVIParser(exiftool.ExiftoolParser): ...@@ -26,7 +26,7 @@ class AVIParser(exiftool.ExiftoolParser):
def remove_all(self): def remove_all(self):
cmd = [_get_ffmpeg_path(), cmd = [_get_ffmpeg_path(),
'-i', self.filename, # input file '-i', self.filename, # input file
'-y', # overwrite existing output file '-y', # overwrite existing output file
'-loglevel', 'panic', # Don't show log '-loglevel', 'panic', # Don't show log
'-hide_banner', # hide the banner '-hide_banner', # hide the banner
......
...@@ -194,6 +194,13 @@ class TestCorruptedFiles(unittest.TestCase): ...@@ -194,6 +194,13 @@ class TestCorruptedFiles(unittest.TestCase):
images.JPGParser('./tests/data/clean.jpg') images.JPGParser('./tests/data/clean.jpg')
os.remove('./tests/data/clean.jpg') os.remove('./tests/data/clean.jpg')
@unittest.skip("disabled: the body was unreachable behind a bare return")
def test_png_lightweight(self):
    """Feed a torrent file renamed to .png to the PNG parser (currently skipped).

    The test used to start with an unconditional `return`, so it was
    reported as passing without running anything; it is now reported as
    skipped instead.
    """
    # NOTE(review): `lightweight_cleaning` is never set to True here despite
    # the test's name, and it is unclear whether PNGParser accepts a
    # non-PNG file at construction time -- confirm both before enabling.
    shutil.copy('./tests/data/dirty.torrent', './tests/data/clean.png')
    p = images.PNGParser('./tests/data/clean.png')
    self.assertTrue(p.remove_all())
    os.remove('./tests/data/clean.png')
def test_avi(self): def test_avi(self):
try: try:
video._get_ffmpeg_path() video._get_ffmpeg_path()
......
...@@ -212,42 +212,6 @@ class TestRevisionsCleaning(unittest.TestCase): ...@@ -212,42 +212,6 @@ class TestRevisionsCleaning(unittest.TestCase):
os.remove('./tests/data/revision_clean.docx') os.remove('./tests/data/revision_clean.docx')
os.remove('./tests/data/revision_clean.cleaned.docx') os.remove('./tests/data/revision_clean.cleaned.docx')
class TestLightWeightCleaning(unittest.TestCase):
    """Check the lightweight cleaning mode of the PDF and PNG parsers."""

    def test_pdf(self):
        """Clean a copy of the dirty PDF and compare the metadata left behind."""
        source = './tests/data/clean.pdf'
        cleaned = './tests/data/clean.cleaned.pdf'
        shutil.copy('./tests/data/dirty.pdf', source)

        parser = pdf.PDFParser(source)
        # The fixture must carry metadata before cleaning.
        self.assertEqual(parser.get_meta()['producer'], 'pdfTeX-1.40.14')

        parser.lightweight_cleaning = True
        self.assertTrue(parser.remove_all())

        remaining = pdf.PDFParser(cleaned).get_meta()
        self.assertEqual(remaining, {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1})

        for path in (source, cleaned):
            os.remove(path)

    def test_png(self):
        """Clean a copy of the dirty PNG and expect no metadata afterwards."""
        source = './tests/data/clean.png'
        cleaned = './tests/data/clean.cleaned.png'
        shutil.copy('./tests/data/dirty.png', source)

        parser = images.PNGParser(source)
        # The fixture must carry metadata before cleaning.
        self.assertEqual(parser.get_meta()['Comment'], 'This is a comment, be careful!')

        parser.lightweight_cleaning = True
        self.assertTrue(parser.remove_all())

        self.assertEqual(images.PNGParser(cleaned).get_meta(), {})

        for path in (source, cleaned):
            os.remove(path)
class TestCleaning(unittest.TestCase): class TestCleaning(unittest.TestCase):
def test_pdf(self): def test_pdf(self):
shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf') shutil.copy('./tests/data/dirty.pdf', './tests/data/clean.pdf')
......
#!/usr/bin/env python3
import unittest
import shutil
import os
from libmat2 import pdf, images
class TestLightWeightCleaning(unittest.TestCase):
    """Check the lightweight cleaning mode of the PDF, PNG and JPG parsers.

    Each test copies a `dirty` fixture to `./tests/data/clean.<ext>`, checks
    that a known metadata field is present, cleans the copy with
    `lightweight_cleaning` enabled and inspects the metadata of the
    resulting `./tests/data/clean.cleaned.<ext>` file.
    """

    @staticmethod
    def _discard(path):
        """Delete `path`, tolerating the case where it was never created."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass

    def _stage(self, extension):
        """Copy the dirty fixture for `extension` and schedule file removal.

        Returns a `(clean, cleaned)` pair: the path of the working copy and
        the path where the tests look for the cleaned output.
        """
        clean = './tests/data/clean.' + extension
        cleaned = './tests/data/clean.cleaned.' + extension
        # Registered up front so that the working files are removed even
        # when an assertion fails halfway through a test.
        self.addCleanup(self._discard, cleaned)
        self.addCleanup(self._discard, clean)
        shutil.copy('./tests/data/dirty.' + extension, clean)
        return clean, cleaned

    def test_pdf(self):
        clean, cleaned = self._stage('pdf')
        p = pdf.PDFParser(clean)

        meta = p.get_meta()
        self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')

        p.lightweight_cleaning = True
        ret = p.remove_all()
        self.assertTrue(ret)

        p = pdf.PDFParser(cleaned)
        expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
        self.assertEqual(p.get_meta(), expected_meta)

    def test_png(self):
        clean, cleaned = self._stage('png')
        p = images.PNGParser(clean)

        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'This is a comment, be careful!')

        p.lightweight_cleaning = True
        ret = p.remove_all()
        self.assertTrue(ret)

        p = images.PNGParser(cleaned)
        self.assertEqual(p.get_meta(), {})

        # Clean a second time while the previous output is still on disk.
        p = images.PNGParser(clean)
        p.lightweight_cleaning = True
        ret = p.remove_all()
        self.assertTrue(ret)

    def test_jpg(self):
        clean, cleaned = self._stage('jpg')
        p = images.JPGParser(clean)

        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'Created with GIMP')

        p.lightweight_cleaning = True
        ret = p.remove_all()
        self.assertTrue(ret)

        p = images.JPGParser(cleaned)
        self.assertEqual(p.get_meta(), {})
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment