Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
mat2
Manage
Activity
Members
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Model registry
Analyze
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
atenart
mat2
Commits
96299c6a
Commit
96299c6a
authored
7 years ago
by
Julien (jvoisin) Voisin
Browse files
Options
Downloads
Patches
Plain Diff
Add lightweight processing for PDF
parent
6f4ed249
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
main.py
+10
-5
10 additions, 5 deletions
main.py
src/abstract.py
+4
-0
4 additions, 0 deletions
src/abstract.py
src/pdf.py
+37
-8
37 additions, 8 deletions
src/pdf.py
tests/test_climat2.py
+2
-2
2 additions, 2 deletions
tests/test_climat2.py
tests/test_libmat2.py
+31
-0
31 additions, 0 deletions
tests/test_libmat2.py
with
84 additions
and
15 deletions
main.py
+
10
−
5
View file @
96299c6a
...
...
@@ -31,6 +31,8 @@ def create_arg_parser():
help
=
'
list all supported fileformats
'
)
info
.
add_argument
(
'
-s
'
,
'
--show
'
,
action
=
'
store_true
'
,
help
=
'
list all the harmful metadata of a file without removing them
'
)
info
.
add_argument
(
'
-L
'
,
'
--lightweight
'
,
action
=
'
store_true
'
,
help
=
'
remove SOME metadata
'
)
return
parser
...
...
@@ -50,7 +52,7 @@ def show_meta(filename:str):
print
(
"
%s: harmful content
"
%
k
)
def
clean_meta
(
filename
:
str
):
def
clean_meta
(
filename
:
str
,
is_lightweigth
:
bool
):
if
not
__check_file
(
filename
,
os
.
R_OK
|
os
.
W_OK
):
return
...
...
@@ -58,7 +60,10 @@ def clean_meta(filename:str):
if
p
is
None
:
print
(
"
[-] %s
'
s format (%s) is not supported
"
%
(
filename
,
mtype
))
return
p
.
remove_all
()
if
is_lightweigth
:
p
.
remove_all_lightweight
()
else
:
p
.
remove_all
()
def
show_parsers
():
...
...
@@ -78,12 +83,12 @@ def __get_files_recursively(files):
for
_f
in
_files
:
yield
os
.
path
.
join
(
path
,
_f
)
def __do_clean_async(is_lightweigth, q):
    """
    Worker loop: pull filenames from the queue `q` and clean each of them.

    :param is_lightweigth: forwarded to clean_meta() as its second argument,
        selecting lightweight cleaning instead of the full one.
    :param q: queue of filenames to process; a `None` item is the sentinel
        telling this worker to stop.
    """
    while True:
        f = q.get()
        if f is None:  # nothing more to process
            return
        # clean_meta() is declared as (filename, is_lightweigth): the filename
        # has to come first, otherwise the flag is treated as the path.
        clean_meta(f, is_lightweigth)
        q.task_done()
...
...
@@ -109,7 +114,7 @@ def main():
q
.
put
(
f
)
for
_
in
range
(
multiprocessing
.
cpu_count
()):
worker
=
Thread
(
target
=
__do_clean_async
,
args
=
(
q
,
))
worker
=
Thread
(
target
=
__do_clean_async
,
args
=
(
mode
,
q
))
worker
.
start
()
threads
.
append
(
worker
)
...
...
This diff is collapsed.
Click to expand it.
src/abstract.py
+
4
−
0
View file @
96299c6a
...
...
@@ -16,3 +16,7 @@ class AbstractParser(abc.ABC):
@abc.abstractmethod
def remove_all(self) -> bool:
    """
    Clean the file handled by this parser.

    Abstract: every concrete parser has to provide its own implementation.
    The annotation promises a boolean result.
    NOTE(review): presumably True means the cleaning succeeded -- confirm
    against the concrete parsers.
    """
def remove_all_lightweight(self) -> bool:
    """
    Remove _SOME_ metadata.

    Default behaviour for parsers that do not override this method:
    delegate to remove_all() and hand back whatever it returns.
    """
    outcome = self.remove_all()
    return outcome
This diff is collapsed.
Click to expand it.
src/pdf.py
+
37
−
8
View file @
96299c6a
...
...
@@ -29,18 +29,43 @@ class PDFParser(abstract.AbstractParser):
self
.
uri
=
'
file://
'
+
os
.
path
.
abspath
(
self
.
filename
)
self
.
__scale
=
2
# how much precision do we want for the render
def remove_all_lightweight(self) -> bool:
    """
    Load the document into Poppler, render pages on a new PDFSurface.

    The pages are drawn onto a temporary PDF, which is then handed to
    __remove_superficial_meta() to write `self.output_filename`.
    The temporary file is removed even if rendering or saving fails.

    :return: True once the output file has been written.
    """
    document = Poppler.Document.new_from_file(self.uri, None)
    pages_count = document.get_n_pages()

    # mkstemp() returns an *open* descriptor along with the path; cairo
    # reopens the file by name, so close ours right away instead of leaking it.
    tmp_fd, tmp_path = tempfile.mkstemp()
    os.close(tmp_fd)

    try:
        # The initial size is a placeholder: it is reset for every page below.
        pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)
        pdf_context = cairo.Context(pdf_surface)  # context draws on the surface

        for pagenum in range(pages_count):
            logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
            page = document.get_page(pagenum)
            page_width, page_height = page.get_size()
            pdf_surface.set_size(page_width, page_height)
            pdf_context.save()
            page.render_for_printing(pdf_context)
            pdf_context.restore()
            pdf_context.show_page()  # draw pdf_context on pdf_surface
        pdf_surface.finish()

        self.__remove_superficial_meta(tmp_path, self.output_filename)
    finally:
        os.remove(tmp_path)

    return True
def
remove_all
(
self
):
"""
Load the document into Poppler, render pages on PNG,
and shove those PNG into a new PDF. Metadata from the new
PDF are removed via Poppler, because there is no way to tell
cairo to not add
"
created by cairo
"
during rendering.
and shove those PNG into a new PDF.
"""
document
=
Poppler
.
Document
.
new_from_file
(
self
.
uri
,
None
)
pages_count
=
document
.
get_n_pages
()
_
,
tmp_path
=
tempfile
.
mkstemp
()
pdf_surface
=
cairo
.
PDFSurface
(
tmp_path
,
128
,
128
)
pdf_surface
=
cairo
.
PDFSurface
(
tmp_path
,
32
,
32
)
# resized later anyway
pdf_context
=
cairo
.
Context
(
pdf_surface
)
for
pagenum
in
range
(
pages_count
):
...
...
@@ -69,14 +94,18 @@ class PDFParser(abstract.AbstractParser):
pdf_surface
.
finish
()
# Removes metadata added by Poppler
document
=
Poppler
.
Document
.
new_from_file
(
'
file://
'
+
tmp_path
)
document
.
set_producer
(
''
)
document
.
set_creator
(
''
)
document
.
save
(
'
file://
'
+
os
.
path
.
abspath
(
self
.
output_filename
))
self
.
__remove_superficial_meta
(
tmp_path
,
self
.
output_filename
)
os
.
remove
(
tmp_path
)
return
True
def __remove_superficial_meta(self, in_file: str, out_file: str) -> bool:
    """
    Blank the `producer` and `creator` fields of a PDF.

    :param in_file: path of the PDF to read.
    :param out_file: path the modified PDF is saved to.
    :return: always True; Poppler failures propagate as exceptions.
    """
    import pathlib

    # Build real file:// URIs: plain concatenation leaves characters such as
    # '#', '%' or spaces unescaped, which is not a valid local-file URI.
    # as_uri() needs an absolute path, hence abspath() on both sides.
    in_uri = pathlib.Path(os.path.abspath(in_file)).as_uri()
    out_uri = pathlib.Path(os.path.abspath(out_file)).as_uri()

    # Explicit `None` password, like the other new_from_file() calls.
    document = Poppler.Document.new_from_file(in_uri, None)
    document.set_producer('')
    document.set_creator('')
    document.save(out_uri)
    return True
def
__parse_metadata_field
(
self
,
data
:
str
)
->
dict
:
metadata
=
{}
...
...
This diff is collapsed.
Click to expand it.
tests/test_climat2.py
+
2
−
2
View file @
96299c6a
...
...
@@ -6,12 +6,12 @@ class TestHelp(unittest.TestCase):
def test_help(self):
    """`./main.py --help` prints the usage line, including the -L switch."""
    proc = subprocess.Popen(['./main.py', '--help'], stdout=subprocess.PIPE)
    stdout, _ = proc.communicate()
    # Only the current usage string is checked: the one without [-L] can no
    # longer appear in the output now that the option exists.
    self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout)
def test_no_arg(self):
    """Running `./main.py` without arguments prints the usage line."""
    proc = subprocess.Popen(['./main.py'], stdout=subprocess.PIPE)
    stdout, _ = proc.communicate()
    # Only the current usage string is checked: the one without [-L] can no
    # longer appear in the output now that the option exists.
    self.assertIn(b'usage: main.py [-h] [-c] [-l] [-s] [-L] [files [files ...]]', stdout)
class
TestGetMeta
(
unittest
.
TestCase
):
...
...
This diff is collapsed.
Click to expand it.
tests/test_libmat2.py
+
31
−
0
View file @
96299c6a
...
...
@@ -138,6 +138,37 @@ class TestDeepCleaning(unittest.TestCase):
os
.
remove
(
'
./tests/data/clean.odt
'
)
class TestLightWeightCleaning(unittest.TestCase):
    """
    Exercise remove_all_lightweight() on a PDF and on a PNG.

    Each test works on a copy of a "dirty" fixture; the copy and the
    `.cleaned` file produced next to it are removed when the test ends,
    whether it passed or not.
    """

    @staticmethod
    def _remove_if_present(path):
        """Delete `path`, ignoring the case where it was never created."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass

    def _copy_fixture(self, dirty, clean):
        """Copy `dirty` to `clean` and schedule removal of the test's files."""
        shutil.copy(dirty, clean)
        # addCleanup runs even when an assertion fails, so no stray file is
        # left in tests/data for the next run.
        self.addCleanup(self._remove_if_present, clean)
        self.addCleanup(self._remove_if_present, clean + '.cleaned')

    def test_pdf(self):
        self._copy_fixture('./tests/data/dirty.pdf', './tests/data/clean.pdf')
        p = pdf.PDFParser('./tests/data/clean.pdf')

        meta = p.get_meta()
        self.assertEqual(meta['producer'], 'pdfTeX-1.40.14')

        ret = p.remove_all_lightweight()
        self.assertTrue(ret)

        p = pdf.PDFParser('./tests/data/clean.pdf.cleaned')
        expected_meta = {'creation-date': -1, 'format': 'PDF-1.5', 'mod-date': -1}
        self.assertEqual(p.get_meta(), expected_meta)

    def test_png(self):
        self._copy_fixture('./tests/data/dirty.png', './tests/data/clean.png')
        p = images.PNGParser('./tests/data/clean.png')

        meta = p.get_meta()
        self.assertEqual(meta['Comment'], 'This is a comment, be careful!')

        ret = p.remove_all_lightweight()
        self.assertTrue(ret)

        p = images.PNGParser('./tests/data/clean.png.cleaned')
        self.assertEqual(p.get_meta(), {})
class
TestCleaning
(
unittest
.
TestCase
):
def
test_pdf
(
self
):
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment