Skip to content
Snippets Groups Projects
Commit 5270071b authored by Julien (jvoisin) Voisin's avatar Julien (jvoisin) Voisin
Browse files

Remove a couple of residual metadata fields in PDF files

This commit removes the residual metadata that mat2 itself
adds to PDF files while cleaning them.
parent 5312603a
No related branches found
No related tags found
No related merge requests found
...@@ -122,6 +122,17 @@ class PDFParser(abstract.AbstractParser): ...@@ -122,6 +122,17 @@ class PDFParser(abstract.AbstractParser):
document.set_creator('') document.set_creator('')
document.set_creation_date(-1) document.set_creation_date(-1)
document.save('file://' + os.path.abspath(out_file)) document.save('file://' + os.path.abspath(out_file))
# Cairo adds "/Producer" and "/CreationDate", and Poppler sometimes
# fails to remove them, so we have to use this terrible regex.
# It should(tm) be alright though, because Cairo's output format
# for metadata is fixed.
with open(out_file, 'rb') as f:
out = re.sub(rb'<<[\s\n]*/Producer.*?>>', b' << >>', f.read(), 0,
re.DOTALL | re.IGNORECASE)
with open(out_file, 'wb') as f:
f.write(out)
return True return True
@staticmethod @staticmethod
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment