diff --git a/README.md b/README.md
index 28ea2fd66486cff681b0ca97c09233ffa7c0c404..821e5e2bac7d4a108df8f57da6b4c3d56ed933a1 100644
--- a/README.md
+++ b/README.md
@@ -140,6 +140,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
 Copyright 2018 Julien (jvoisin) Voisin <julien.voisin+mat2@dustri.org>
 Copyright 2016 Marie Rose for MAT2's logo
 
+The `tests/data/dirty_with_nsid.docx` file is licensed under GPLv3,
+and was borrowed from the Calibre project: https://calibre-ebook.com/downloads/demos/demo.docx
+
 # Thanks
 
 MAT2 wouldn't exist without:
diff --git a/tests/data/dirty_with_nsid.docx b/tests/data/dirty_with_nsid.docx
new file mode 100644
index 0000000000000000000000000000000000000000..6f4ae9943b9eebffb4ee0c778b7f57d223d7e0ca
Binary files /dev/null and b/tests/data/dirty_with_nsid.docx differ
diff --git a/tests/test_deep_cleaning.py b/tests/test_deep_cleaning.py
index ccd4955efd2c2fd3fed8aad28c281668b461a1bc..aab46c76536ff0d86f6e7c17b36dd0ff2b6e965b 100644
--- a/tests/test_deep_cleaning.py
+++ b/tests/test_deep_cleaning.py
@@ -137,3 +137,49 @@ class TestRsidRemoval(unittest.TestCase):
 
         os.remove('./tests/data/clean.docx')
         os.remove('./tests/data/clean.cleaned.docx')
+
+
+class TestNsidRemoval(unittest.TestCase):
+    """Check that no `w:nsid` occurrence survives cleaning of a .docx file."""
+
+    def test_office(self):
+        """Clean a copy of `dirty_with_nsid.docx` and inspect its XML parts."""
+        dirty_copy = './tests/data/clean.docx'
+        cleaned = './tests/data/clean.cleaned.docx'
+
+        def remove_artifacts():
+            # Registered as a cleanup so that a failing assertion does not
+            # leave stale files behind for the tests that run afterwards.
+            for path in (dirty_copy, cleaned):
+                if os.path.exists(path):
+                    os.remove(path)
+
+        self.addCleanup(remove_artifacts)
+
+        shutil.copy('./tests/data/dirty_with_nsid.docx', dirty_copy)
+        p = office.MSOfficeParser(dirty_copy)
+
+        meta = p.get_meta()
+        self.assertIsNotNone(meta)
+
+        # Sanity check on the fixture itself: before cleaning, its XML parts
+        # are expected to hold exactly 1190 `w:rsid` occurrences.
+        how_many_rsid = 0
+        with zipfile.ZipFile(dirty_copy) as zin:
+            for item in zin.infolist():
+                if not item.filename.endswith('.xml'):
+                    continue
+                num = zin.read(item).decode('utf-8').lower().count('w:rsid')
+                how_many_rsid += num
+        self.assertEqual(how_many_rsid, 1190)
+
+        ret = p.remove_all()
+        self.assertTrue(ret)
+
+        with zipfile.ZipFile(cleaned) as zin:
+            for item in zin.infolist():
+                if not item.filename.endswith('.xml'):
+                    continue
+                num = zin.read(item).decode('utf-8').lower().count('w:nsid')
+                # Name the offending archive member in the failure message.
+                self.assertEqual(num, 0, item.filename)