Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
10
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
jvoisin
mat2
Commits
ec082d64
Commit
ec082d64
authored
Feb 07, 2021
by
jvoisin
1
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Improve a bit the support of epub
parent
f8111547
Pipeline
#54869
failed with stages
in 3 minutes and 35 seconds
Changes
1
Pipelines
4
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
26 additions
and
2 deletions
+26
-2
libmat2/epub.py
libmat2/epub.py
+26
-2
No files found.
libmat2/epub.py
View file @
ec082d64
...
...
@@ -16,11 +16,17 @@ class EPUBParser(archive.ZipParser):
'mimetype'
,
'OEBPS/content.opf'
,
'content.opf'
,
'hmh.opf'
,
'OPS/.+.xml'
}))
self
.
files_to_omit
=
set
(
map
(
re
.
compile
,
{
# type: ignore
'iTunesMetadata.plist'
'META-INF/calibre_bookmarks.txt'
}))
self
.
uniqid
=
uuid
.
uuid4
()
def
_specific_get_meta
(
self
,
full_path
,
file_path
):
if
not
file_path
.
endswith
(
'
content
.opf'
):
if
not
file_path
.
endswith
(
'.opf'
):
return
{}
with
open
(
full_path
,
encoding
=
'utf-8'
)
as
f
:
...
...
@@ -32,12 +38,30 @@ class EPUBParser(archive.ZipParser):
return
{
file_path
:
'harmful content'
,
}
def
_specific_cleanup
(
self
,
full_path
:
str
):
if
full_path
.
endswith
(
'content.opf'
):
if
full_path
.
endswith
(
'hmh.opf'
)
or
full_path
.
endswith
(
'content.opf'
):
return
self
.
__handle_contentopf
(
full_path
)
elif
full_path
.
endswith
(
'OEBPS/toc.ncx'
):
return
self
.
__handle_tocncx
(
full_path
)
elif
re
.
search
(
'/OPS/[^/]+.xml$'
,
full_path
):
return
self
.
__handle_ops_xml
(
full_path
)
return
True
def __handle_ops_xml(self, full_path: str):
    """Empty the first ``head``-like element of an XML file, in place.

    The file is parsed, the first element whose lowercased, stripped tag
    ends with ``head`` is cleared, and the tree is written back to
    ``full_path`` with an XML declaration, UTF-8 encoding and no
    self-closing empty elements.

    Args:
        full_path: Path of the XML file to rewrite.

    Returns:
        ``False`` when the file cannot be parsed (the failure is logged),
        ``True`` once the tree has been written back.
    """
    try:
        parsed = office._parse_xml(full_path)
    except ET.ParseError:  # pragma: nocover
        logging.error("Unable to parse %s in %s.", full_path, self.filename)
        return False
    document, nsmap = parsed

    for element in document.iterfind('.//', nsmap):  # pragma: nocover
        normalized_tag = element.tag.strip().lower()
        if not normalized_tag.endswith('head'):
            continue
        # Only the first match is cleared; later ones are left untouched.
        element.clear()
        break

    document.write(
        full_path,
        xml_declaration=True,
        encoding='utf-8',
        short_empty_elements=False,
    )
    return True
def
__handle_tocncx
(
self
,
full_path
:
str
):
try
:
tree
,
namespace
=
office
.
_parse_xml
(
full_path
)
...
...
jvoisin
@jvoisin
mentioned in issue
#148
·
Feb 07, 2021
mentioned in issue
#148
mentioned in issue #148
Toggle commit list
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment