From 3323c1dbfade004d89c5492c84adc829095fe9fa Mon Sep 17 00:00:00 2001
From: jvoisin <julien.voisin@dustri.org>
Date: Sat, 8 Feb 2020 16:08:32 +0100
Subject: [PATCH] Remove a couple of residual metadata entries in PDF files

This commit removes the residual metadata that mat2 itself
adds while cleaning PDF files.
---
 libmat2/pdf.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/libmat2/pdf.py b/libmat2/pdf.py
index 547e071..cc83812 100644
--- a/libmat2/pdf.py
+++ b/libmat2/pdf.py
@@ -122,6 +122,17 @@ class PDFParser(abstract.AbstractParser):
         document.set_creator('')
         document.set_creation_date(-1)
         document.save('file://' + os.path.abspath(out_file))
+
+        # Cairo adds "/Producer" and "/CreationDate", and Poppler sometimes
+        # fails to remove them, so we have to use this terrible regex.
+        # It should(tm) be alright though, because cairo's output format
+        # for metadata is fixed.
+        with open(out_file, 'rb') as f:
+            out = re.sub(b'<<[\s\n]*/Producer.*?>>', b' << >>', f.read(), 0,
+                    re.DOTALL | re.IGNORECASE)
+        with open(out_file, 'wb') as f:
+            f.write(out)
+
         return True
 
     @staticmethod
-- 
GitLab