Commit 82cc822a authored by jvoisin's avatar jvoisin

Add tar archive support

parent 20ed5eb7
Pipeline #24294 passed with stages
in 1 minute and 45 seconds
This diff is collapsed.
......@@ -5,7 +5,7 @@ import xml.etree.ElementTree as ET # type: ignore
from . import archive, office
class EPUBParser(archive.ArchiveBasedAbstractParser):
class EPUBParser(archive.ZipParser):
mimetypes = {'application/epub+zip', }
metadata_namespace = '{http://purl.org/dc/elements/1.1/}'
......
......@@ -6,7 +6,7 @@ from typing import Dict, Set, Pattern, Tuple, Any
import xml.etree.ElementTree as ET # type: ignore
from .archive import ArchiveBasedAbstractParser
from .archive import ZipParser
# pylint: disable=line-too-long
......@@ -43,7 +43,7 @@ def _sort_xml_attributes(full_path: str) -> bool:
return True
class MSOfficeParser(ArchiveBasedAbstractParser):
class MSOfficeParser(ZipParser):
mimetypes = {
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
......@@ -312,7 +312,7 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
return {file_path: 'harmful content', }
class LibreOfficeParser(ArchiveBasedAbstractParser):
class LibreOfficeParser(ZipParser):
mimetypes = {
'application/vnd.oasis.opendocument.text',
'application/vnd.oasis.opendocument.spreadsheet',
......
#!/usr/bin/env python3
import unittest
import time
import shutil
import os
import logging
import zipfile
import tarfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent
from libmat2 import harmless, video, web
from libmat2 import harmless, video, web, archive
# No need to log messages: should something go wrong,
# the testsuite _will_ fail.
......@@ -278,7 +280,6 @@ class TestCorruptedFiles(unittest.TestCase):
p.remove_all()
os.remove('./tests/data/clean.html')
def test_epub(self):
with zipfile.ZipFile('./tests/data/clean.epub', 'w') as zout:
zout.write('./tests/data/dirty.jpg', 'OEBPS/content.opf')
......@@ -291,3 +292,27 @@ class TestCorruptedFiles(unittest.TestCase):
self.assertFalse(p.remove_all())
os.remove('./tests/data/clean.epub')
def test_tar(self):
    """Check tar handling when the archive holds `embedded_corrupted.docx`.

    Builds `clean.tar` from several fixtures plus one hand-built entry,
    verifies the detected mimetype and some nested metadata, and expects
    `remove_all()` to return a falsy value. Afterwards a PNG copied to a
    `.tar` name must make `archive.TarParser` raise `ValueError`.
    """
    tar_path = './tests/data/clean.tar'
    members = (
        './tests/data/dirty.flac',
        './tests/data/dirty.docx',
        './tests/data/dirty.jpg',
        './tests/data/embedded_corrupted.docx',
    )
    with tarfile.TarFile(tar_path, 'w') as tar:
        for member in members:
            tar.add(member)

        # Hand-built entry carrying explicit owner ids and a fresh mtime.
        # NOTE(review): `size` is never set on this TarInfo, so presumably
        # no payload bytes are stored for it -- confirm this is intended.
        png_info = tarfile.TarInfo(name='./tests/data/dirty.png')
        png_info.mtime = time.time()
        png_info.uid = 1337
        png_info.gid = 1338
        with open('./tests/data/dirty.png', 'rb') as png:
            tar.addfile(png_info, png)

    parser, detected_mime = parser_factory.get_parser(tar_path)
    self.assertEqual(detected_mime, 'application/x-tar')

    metadata = parser.get_meta()
    flac_meta = metadata['./tests/data/dirty.flac']
    self.assertEqual(flac_meta['comments'], 'Thank you for using MAT !')
    docx_meta = metadata['./tests/data/dirty.docx']
    self.assertEqual(docx_meta['word/media/image1.png']['Comment'], 'This is a comment, be careful!')

    self.assertFalse(parser.remove_all())
    os.remove(tar_path)

    # A PNG stored under a `.tar` name must be refused by the constructor.
    shutil.copy('./tests/data/dirty.png', tar_path)
    with self.assertRaises(ValueError):
        archive.TarParser(tar_path)
    os.remove(tar_path)
......@@ -4,6 +4,8 @@ import unittest
import shutil
import os
import re
import tarfile
import tempfile
import zipfile
from libmat2 import pdf, images, audio, office, parser_factory, torrent, harmless
......@@ -195,6 +197,19 @@ class TestGetMeta(unittest.TestCase):
self.assertEqual(meta['version'], '1.0')
self.assertEqual(meta['harmful data'], 'underline is cool')
def test_tar(self):
    """Check that metadata of tar members is reported per member path.

    Packs three fixtures into `dirty.tar`, asks `parser_factory` for a
    parser, and inspects the mimetype and two entries of the metadata
    mapping before deleting the archive.
    """
    tar_path = './tests/data/dirty.tar'
    members = (
        './tests/data/dirty.flac',
        './tests/data/dirty.docx',
        './tests/data/dirty.jpg',
    )
    with tarfile.TarFile(tar_path, 'w') as tar:
        for member in members:
            tar.add(member)

    parser, detected_mime = parser_factory.get_parser(tar_path)
    self.assertEqual(detected_mime, 'application/x-tar')

    metadata = parser.get_meta()
    flac_meta = metadata['./tests/data/dirty.flac']
    self.assertEqual(flac_meta['comments'], 'Thank you for using MAT !')
    # Metadata of files nested inside the docx is keyed by their inner path.
    docx_meta = metadata['./tests/data/dirty.docx']
    self.assertEqual(docx_meta['word/media/image1.png']['Comment'], 'This is a comment, be careful!')

    os.remove(tar_path)
class TestRemovingThumbnails(unittest.TestCase):
def test_odt(self):
shutil.copy('./tests/data/revision.odt', './tests/data/clean.odt')
......@@ -702,3 +717,38 @@ class TestCleaning(unittest.TestCase):
os.remove('./tests/data/clean.css')
os.remove('./tests/data/clean.cleaned.css')
os.remove('./tests/data/clean.cleaned.cleaned.css')
def test_tar(self):
    """Clean a tar archive and check every extracted member is metadata-free.

    Steps:
      1. Pack three fixtures into `dirty.tar` and confirm metadata is seen.
      2. Run `remove_all()` and check that `dirty.cleaned.tar` reports no
         metadata and can itself be cleaned again.
      3. Extract the cleaned archive into a temporary directory and verify
         that each of the three members gets a parser and yields `{}`.

    The extraction directory is managed by `tempfile.TemporaryDirectory`,
    so it is removed even when an assertion fails.
    """
    with tarfile.TarFile('./tests/data/dirty.tar', 'w') as tout:
        tout.add('./tests/data/dirty.flac')
        tout.add('./tests/data/dirty.docx')
        tout.add('./tests/data/dirty.jpg')

    p = archive.TarParser('./tests/data/dirty.tar')
    meta = p.get_meta()
    self.assertEqual(meta['./tests/data/dirty.docx']['word/media/image1.png']['Comment'], 'This is a comment, be careful!')

    ret = p.remove_all()
    self.assertTrue(ret)

    p = archive.TarParser('./tests/data/dirty.cleaned.tar')
    self.assertEqual(p.get_meta(), {})
    self.assertTrue(p.remove_all())

    number_of_files = 0
    with tempfile.TemporaryDirectory() as tmp_dir:
        # The `with` statement closes the archive; no explicit close() needed.
        with tarfile.open('./tests/data/dirty.cleaned.tar') as tin:
            tin.extractall(path=tmp_dir)

        for root, _, fnames in os.walk(tmp_dir):
            for fname in fnames:
                complete_path = os.path.join(root, fname)
                member_parser, _mime = parser_factory.get_parser(complete_path)
                self.assertIsNotNone(member_parser)
                self.assertEqual(member_parser.get_meta(), {})
                number_of_files += 1
    self.assertEqual(number_of_files, 3)

    os.remove('./tests/data/dirty.tar')
    os.remove('./tests/data/dirty.cleaned.tar')
    os.remove('./tests/data/dirty.cleaned.cleaned.tar')
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment