From 9827f11944dc9e317ed9aad5d7ad64cea97465bb Mon Sep 17 00:00:00 2001
From: Alex Xu
Date: Fri, 15 May 2026 13:24:35 -0700
Subject: [PATCH] experimental epub input (#1090)
* experimental epub input
* fix missing spine items
* only extract first image on page
* re-organize
* fallback if naive spine extraction fails
* apply legacy extract option for epub too
---
README.md | 2 +-
gui/KCC.ui | 8 ++--
kindlecomicconverter/KCC_gui.py | 10 ++---
kindlecomicconverter/KCC_ui.py | 12 +++---
kindlecomicconverter/comic2ebook.py | 66 ++++++++++++++++++++++++++---
5 files changed, 73 insertions(+), 25 deletions(-)
diff --git a/README.md b/README.md
index ca7b327..4a89344 100644
--- a/README.md
+++ b/README.md
@@ -247,7 +247,7 @@ MAIN:
PROCESSING:
-n, --noprocessing Do not modify image and ignore any profile or processing option
- --pdfextract Use legacy PDF image extraction method from KCC 8 and earlier.
+ --legacyextract Use legacy PDF/EPUB image extraction method from earlier KCC versions.
--pdfwidth Render vector PDFs based on device width instead of height.
-u, --upscale Resize images smaller than device's resolution
-s, --stretch Stretch images to device's resolution
diff --git a/gui/KCC.ui b/gui/KCC.ui
index 9cc41d3..fffbe9b 100644
--- a/gui/KCC.ui
+++ b/gui/KCC.ui
@@ -750,14 +750,12 @@ Higher values are larger and higher quality, and may resolve blank page issues.<
-
-
+
- Use the PDF image extraction method from KCC 8 and earlier.
-
-Useful for really weird PDFs.
+ <html><head/><body><p>Use the PDF/EPUB image extraction method from older KCC versions.</p><p><br/></p><p>Use if standard extraction fails for whatever reason.</p></body></html>
- PDF Legacy Extract
+ Legacy Extract
diff --git a/kindlecomicconverter/KCC_gui.py b/kindlecomicconverter/KCC_gui.py
index c25c290..802b73d 100644
--- a/kindlecomicconverter/KCC_gui.py
+++ b/kindlecomicconverter/KCC_gui.py
@@ -326,8 +326,8 @@ class WorkerThread(QThread):
options.maximizestrips = True
if GUI.disableProcessingBox.isChecked():
options.noprocessing = True
- if GUI.pdfExtractBox.isChecked():
- options.pdfextract = True
+ if GUI.legacyExtractBox.isChecked():
+ options.legacyextract = True
if GUI.pdfWidthBox.isChecked():
options.pdfwidth = True
if GUI.smartCoverCropBox.isChecked():
@@ -625,7 +625,7 @@ class KCCGUI(KCC_ui.Ui_mainWindow):
GUI.jobList.clear()
if self.tar or self.sevenzip:
fnames = QFileDialog.getOpenFileNames(MW, 'Select file', self.lastPath,
- 'Comic (*.cbz *.cbr *.cb7 *.zip *.rar *.7z *.pdf);;All (*.*)')
+ 'Comic (*.cbz *.cbr *.cb7 *.zip *.rar *.7z *.epub *.pdf);;All (*.*)')
else:
fnames = QFileDialog.getOpenFileNames(MW, 'Select file', self.lastPath,
'Comic (*.pdf);;All (*.*)')
@@ -1080,7 +1080,7 @@ class KCCGUI(KCC_ui.Ui_mainWindow):
'colorBox': GUI.colorBox.checkState(),
'eraseRainbowBox': GUI.eraseRainbowBox.checkState(),
'disableProcessingBox': GUI.disableProcessingBox.checkState(),
- 'pdfExtractBox': GUI.pdfExtractBox.checkState(),
+ 'legacyExtractBox': GUI.legacyExtractBox.checkState(),
'pdfWidthBox': GUI.pdfWidthBox.checkState(),
'smartCoverCropBox': GUI.smartCoverCropBox.checkState(),
'coverFillBox': GUI.coverFillBox.checkState(),
@@ -1120,7 +1120,7 @@ class KCCGUI(KCC_ui.Ui_mainWindow):
GUI.jobList.clear()
formats = ['.pdf']
if self.tar or self.sevenzip:
- formats.extend(['.cb7', '.7z', '.cbz', '.zip', '.cbr', '.rar'])
+ formats.extend(['.cb7', '.7z', '.cbz', '.zip', '.cbr', '.rar', '.epub'])
if os.path.isdir(message):
GUI.jobList.addItem(message)
GUI.jobList.scrollToBottom()
diff --git a/kindlecomicconverter/KCC_ui.py b/kindlecomicconverter/KCC_ui.py
index 62e9d29..c762dab 100644
--- a/kindlecomicconverter/KCC_ui.py
+++ b/kindlecomicconverter/KCC_ui.py
@@ -389,10 +389,10 @@ class Ui_mainWindow(object):
self.gridLayout_2.addWidget(self.qualityBox, 1, 2, 1, 1)
- self.pdfExtractBox = QCheckBox(self.optionWidget)
- self.pdfExtractBox.setObjectName(u"pdfExtractBox")
+ self.legacyExtractBox = QCheckBox(self.optionWidget)
+ self.legacyExtractBox.setObjectName(u"legacyExtractBox")
- self.gridLayout_2.addWidget(self.pdfExtractBox, 9, 0, 1, 1)
+ self.gridLayout_2.addWidget(self.legacyExtractBox, 9, 0, 1, 1)
self.colorBox = QCheckBox(self.optionWidget)
self.colorBox.setObjectName(u"colorBox")
@@ -785,11 +785,9 @@ class Ui_mainWindow(object):
#endif // QT_CONFIG(tooltip)
self.qualityBox.setText(QCoreApplication.translate("mainWindow", u"Panel View 4/2/HQ", None))
#if QT_CONFIG(tooltip)
- self.pdfExtractBox.setToolTip(QCoreApplication.translate("mainWindow", u"Use the PDF image extraction method from KCC 8 and earlier.\n"
-"\n"
-"Useful for really weird PDFs.", None))
+ self.legacyExtractBox.setToolTip(QCoreApplication.translate("mainWindow", u"<html><head/><body><p>Use the PDF/EPUB image extraction method from older KCC versions.</p><p><br/></p><p>Use if standard extraction fails for whatever reason.</p></body></html>", None))
#endif // QT_CONFIG(tooltip)
- self.pdfExtractBox.setText(QCoreApplication.translate("mainWindow", u"PDF Legacy Extract", None))
+ self.legacyExtractBox.setText(QCoreApplication.translate("mainWindow", u"Legacy Extract", None))
#if QT_CONFIG(tooltip)
self.colorBox.setToolTip(QCoreApplication.translate("mainWindow", u"Disable conversion to grayscale.
", None))
#endif // QT_CONFIG(tooltip)
diff --git a/kindlecomicconverter/comic2ebook.py b/kindlecomicconverter/comic2ebook.py
index 6a92f1a..3b084d5 100755
--- a/kindlecomicconverter/comic2ebook.py
+++ b/kindlecomicconverter/comic2ebook.py
@@ -22,7 +22,9 @@ from collections import Counter
import os
import pathlib
import re
+import shutil
import sys
+import xml.etree.ElementTree as ET
from argparse import ArgumentParser
from time import perf_counter, strftime, gmtime
from copy import copy
@@ -917,7 +919,7 @@ def getWorkFolder(afile, workdir=None):
os.makedirs(fullPath)
path = workdir
sanitizePermissions(path)
- if options.pdfextract:
+ if options.legacyextract:
pdf = pdfjpgextract.PdfJpgExtract(afile, fullPath)
njpg = pdf.extract()
if njpg == 0:
@@ -956,11 +958,61 @@ def getWorkFolder(afile, workdir=None):
for file in os.listdir(os.path.join(fullPath, tdir[0])):
move(os.path.join(fullPath, tdir[0], file), fullPath)
os.rmdir(os.path.join(fullPath, tdir[0]))
+
+ if options.legacyextract:
+ return workdir
+
+ if afile.lower().endswith('.epub'):
+ container = ET.parse(os.path.join(path, 'META-INF', 'container.xml'))
+ opf_path = container.find(r'.//{*}rootfile').attrib['full-path']
+ opf_path = os.path.join(path, opf_path)
+ opf = ET.parse(opf_path)
+ spine = []
+ for spine_item in opf.findall(r'.//{*}itemref'):
+ spine.append(spine_item.attrib.get('idref'))
+ manifest_dict = {}
+ for manifest_item in opf.findall(".//*[@media-type='application/xhtml+xml']"):
+ manifest_dict[manifest_item.attrib.get('id')] = manifest_item.attrib.get('href')
+ ordered_image_paths = []
+ for i, spine_item in enumerate(spine):
+ if spine_item not in manifest_dict:
+ continue
+ page_path = os.path.join(os.path.dirname(opf_path), manifest_dict[spine_item])
+ page = ET.parse(page_path)
+ imgs = page.findall(r'.//{*}img') + page.findall(r'.//{*}image')
+ img_path = None
+ # TODO handle more than first image
+ for img in imgs:
+ for key in img.attrib:
+ if 'src' in key or 'href' in key:
+ img_path = img.attrib[key]
+ if img_path.startswith('..'):
+ img_path = os.path.join(os.path.dirname(opf_path), os.path.dirname(manifest_dict[spine_item]), img_path)
+ else:
+ img_path = os.path.join(os.path.dirname(opf_path), os.path.dirname(manifest_dict[spine_item]), img_path)
+ break
+ # TODO empty image
+ if img_path:
+ ordered_image_paths.append(img_path)
+ # fallback if naive spine extraction fails
+ if not ordered_image_paths:
+ return workdir
+
+ if options.tempdir:
+ workdir2 = mkdtemp('', 'KCC-', os.path.dirname(afile))
+ else:
+ workdir2 = mkdtemp('', 'KCC-')
+ for i, img_path in enumerate(ordered_image_paths):
+ _, ext = os.path.splitext(img_path)
+ fullpath2 = os.path.join(workdir2, 'OEBPS', 'Images')
+ os.makedirs(fullpath2, exist_ok=True)
+ shutil.copyfile(img_path, os.path.join(fullpath2, f"{i}{ext}"))
+ rmtree(workdir, True)
+ return workdir2
+
return workdir
-
- except OSError as e:
- rmtree(workdir, True)
- raise UserWarning(e)
+ finally:
+ pass
else:
raise UserWarning("Failed to open source file/directory.")
@@ -1406,8 +1458,8 @@ def makeParser():
processing_options.add_argument("-n", "--noprocessing", action="store_true", dest="noprocessing", default=False,
help="Do not modify image and ignore any profile or processing option")
- processing_options.add_argument("--pdfextract", action="store_true", dest="pdfextract", default=False,
- help="Use the legacy PDF image extraction method from KCC 8 and earlier")
+ processing_options.add_argument("--legacyextract", action="store_true", dest="legacyextract", default=False,
+ help="Use the legacy PDF/EPUB image extraction method from older KCC versions")
processing_options.add_argument("--pdfwidth", action="store_true", dest="pdfwidth", default=False,
help="Render vector PDFs to device width instead of height.")
processing_options.add_argument("--smartcovercrop", action="store_true", dest="smartcovercrop", default=False,