Skip to content
Snippets Groups Projects
Commit b9a62d79 authored by Julien (jvoisin) Voisin's avatar Julien (jvoisin) Voisin
Browse files

Refactor office get_meta handling a bit

This should make it easier to get more metadata from
archive-based file formats.
parent 54e50450
No related branches found
No related tags found
No related merge requests found
...@@ -4,7 +4,7 @@ import tempfile ...@@ -4,7 +4,7 @@ import tempfile
import os import os
import logging import logging
import shutil import shutil
from typing import Dict, Set, Pattern, Union from typing import Dict, Set, Pattern, Union, Any
from . import abstract, UnknownMemberPolicy, parser_factory from . import abstract, UnknownMemberPolicy, parser_factory
...@@ -42,6 +42,12 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -42,6 +42,12 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
# pylint: disable=unused-argument,no-self-use # pylint: disable=unused-argument,no-self-use
return True # pragma: no cover return True # pragma: no cover
def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
""" This method can be used to extract specific metadata
from files present in the archive."""
# pylint: disable=unused-argument,no-self-use
return {} # pragma: no cover
@staticmethod @staticmethod
def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo: def _clean_zipinfo(zipinfo: zipfile.ZipInfo) -> zipfile.ZipInfo:
zipinfo.create_system = 3 # Linux zipinfo.create_system = 3 # Linux
...@@ -74,6 +80,10 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -74,6 +80,10 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
temp_folder = tempfile.mkdtemp() temp_folder = tempfile.mkdtemp()
for item in zin.infolist(): for item in zin.infolist():
local_meta = dict() # type: Dict[str, Union[str, Dict]]
for k, v in self._get_zipinfo_meta(item).items():
local_meta[k] = v
if item.filename[-1] == '/': # pragma: no cover if item.filename[-1] == '/': # pragma: no cover
# `is_dir` is added in Python3.6 # `is_dir` is added in Python3.6
continue # don't keep empty folders continue # don't keep empty folders
...@@ -81,11 +91,15 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser): ...@@ -81,11 +91,15 @@ class ArchiveBasedAbstractParser(abstract.AbstractParser):
zin.extract(member=item, path=temp_folder) zin.extract(member=item, path=temp_folder)
full_path = os.path.join(temp_folder, item.filename) full_path = os.path.join(temp_folder, item.filename)
specific_meta = self._specific_get_meta(full_path, item.filename)
for (k, v) in specific_meta.items():
local_meta[k] = v
tmp_parser, _ = parser_factory.get_parser(full_path) # type: ignore tmp_parser, _ = parser_factory.get_parser(full_path) # type: ignore
if not tmp_parser: if tmp_parser:
continue for k, v in tmp_parser.get_meta().items():
local_meta[k] = v
local_meta = tmp_parser.get_meta()
if local_meta: if local_meta:
meta[item.filename] = local_meta meta[item.filename] = local_meta
......
...@@ -2,7 +2,7 @@ import logging ...@@ -2,7 +2,7 @@ import logging
import os import os
import re import re
import zipfile import zipfile
from typing import Dict, Set, Pattern, Tuple, Union from typing import Dict, Set, Pattern, Tuple, Union, Any
import xml.etree.ElementTree as ET # type: ignore import xml.etree.ElementTree as ET # type: ignore
...@@ -295,26 +295,21 @@ class MSOfficeParser(ArchiveBasedAbstractParser): ...@@ -295,26 +295,21 @@ class MSOfficeParser(ArchiveBasedAbstractParser):
return True return True
def get_meta(self) -> Dict[str, Union[str, dict]]: def _specific_get_meta(self, full_path: str, file_path: str) -> Dict[str, Any]:
""" """
Yes, I know that parsing xml with regexp ain't pretty, Yes, I know that parsing xml with regexp ain't pretty,
be my guest and fix it if you want. be my guest and fix it if you want.
""" """
metadata = super().get_meta() if not file_path.startswith('docProps/') or not file_path.endswith('.xml'):
zipin = zipfile.ZipFile(self.filename) return {}
for item in zipin.infolist():
if item.filename.startswith('docProps/') and item.filename.endswith('.xml'): with open(full_path, encoding='utf-8') as f:
try: try:
content = zipin.read(item).decode('utf-8') results = re.findall(r"<(.+)>(.+)</\1>", f.read(), re.I|re.M)
results = re.findall(r"<(.+)>(.+)</\1>", content, re.I|re.M) return {k:v for (k, v) in results}
for (key, value) in results: except (TypeError, UnicodeDecodeError):
metadata[key] = value # We didn't manage to parse the xml file
except (TypeError, UnicodeDecodeError): # We didn't manage to parse the xml file return {file_path: 'harmful content', }
metadata[item.filename] = 'harmful content'
for key, value in self._get_zipinfo_meta(item).items():
metadata[key] = value
zipin.close()
return metadata
class LibreOfficeParser(ArchiveBasedAbstractParser): class LibreOfficeParser(ArchiveBasedAbstractParser):
......
...@@ -131,9 +131,9 @@ class TestGetMeta(unittest.TestCase): ...@@ -131,9 +131,9 @@ class TestGetMeta(unittest.TestCase):
def test_docx(self): def test_docx(self):
p = office.MSOfficeParser('./tests/data/dirty.docx') p = office.MSOfficeParser('./tests/data/dirty.docx')
meta = p.get_meta() meta = p.get_meta()
self.assertEqual(meta['cp:lastModifiedBy'], 'Julien Voisin') self.assertEqual(meta['docProps/core.xml']['cp:lastModifiedBy'], 'Julien Voisin')
self.assertEqual(meta['dc:creator'], 'julien voisin') self.assertEqual(meta['docProps/core.xml']['dc:creator'], 'julien voisin')
self.assertEqual(meta['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1') self.assertEqual(meta['docProps/app.xml']['Application'], 'LibreOffice/5.4.5.1$Linux_X86_64 LibreOffice_project/40m0$Build-1')
def test_libreoffice(self): def test_libreoffice(self):
p = office.LibreOfficeParser('./tests/data/dirty.odt') p = office.LibreOfficeParser('./tests/data/dirty.odt')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment