From 9827f11944dc9e317ed9aad5d7ad64cea97465bb Mon Sep 17 00:00:00 2001 From: Alex Xu Date: Fri, 15 May 2026 13:24:35 -0700 Subject: [PATCH] experimental epub input (#1090) * experimental epub input * fix missing spine items * only extract first image on page * re-organize * fallback if naive spine extraction fails * apply legacy extract option for epub too --- README.md | 2 +- gui/KCC.ui | 8 ++-- kindlecomicconverter/KCC_gui.py | 10 ++--- kindlecomicconverter/KCC_ui.py | 12 +++--- kindlecomicconverter/comic2ebook.py | 66 ++++++++++++++++++++++++++--- 5 files changed, 73 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index ca7b327..4a89344 100644 --- a/README.md +++ b/README.md @@ -247,7 +247,7 @@ MAIN: PROCESSING: -n, --noprocessing Do not modify image and ignore any profile or processing option - --pdfextract Use legacy PDF image extraction method from KCC 8 and earlier. + --legacyextract Use legacy PDF/EPUB image extraction method from earlier KCC versions. --pdfwidth Render vector PDFs based on device width instead of height. -u, --upscale Resize images smaller than device's resolution -s, --stretch Stretch images to device's resolution diff --git a/gui/KCC.ui b/gui/KCC.ui index 9cc41d3..fffbe9b 100644 --- a/gui/KCC.ui +++ b/gui/KCC.ui @@ -750,14 +750,12 @@ Higher values are larger and higher quality, and may resolve blank page issues.< - + - Use the PDF image extraction method from KCC 8 and earlier. - -Useful for really weird PDFs. + <html><head/><body><p>Use the PDF/EPUB image extraction method from older KCC versions.</p><p><br/></p><p>Use if standard extraction fails for whatever reason.</p></body></html> - PDF Legacy Extract + Legacy Extract diff --git a/kindlecomicconverter/KCC_gui.py b/kindlecomicconverter/KCC_gui.py index c25c290..802b73d 100644 --- a/kindlecomicconverter/KCC_gui.py +++ b/kindlecomicconverter/KCC_gui.py @@ -326,8 +326,8 @@ class WorkerThread(QThread): options.maximizestrips = True if GUI.disableProcessingBox.isChecked(): options.noprocessing = True - if GUI.pdfExtractBox.isChecked(): - options.pdfextract = True + if GUI.legacyExtractBox.isChecked(): + options.legacyextract = True if GUI.pdfWidthBox.isChecked(): options.pdfwidth = True if GUI.smartCoverCropBox.isChecked(): @@ -625,7 +625,7 @@ class KCCGUI(KCC_ui.Ui_mainWindow): GUI.jobList.clear() if self.tar or self.sevenzip: fnames = QFileDialog.getOpenFileNames(MW, 'Select file', self.lastPath, - 'Comic (*.cbz *.cbr *.cb7 *.zip *.rar *.7z *.pdf);;All (*.*)') + 'Comic (*.cbz *.cbr *.cb7 *.zip *.rar *.7z *.epub *.pdf);;All (*.*)') else: fnames = QFileDialog.getOpenFileNames(MW, 'Select file', self.lastPath, 'Comic (*.pdf);;All (*.*)') @@ -1080,7 +1080,7 @@ class KCCGUI(KCC_ui.Ui_mainWindow): 'colorBox': GUI.colorBox.checkState(), 'eraseRainbowBox': GUI.eraseRainbowBox.checkState(), 'disableProcessingBox': GUI.disableProcessingBox.checkState(), - 'pdfExtractBox': GUI.pdfExtractBox.checkState(), + 'legacyExtractBox': GUI.legacyExtractBox.checkState(), 'pdfWidthBox': GUI.pdfWidthBox.checkState(), 'smartCoverCropBox': GUI.smartCoverCropBox.checkState(), 'coverFillBox': GUI.coverFillBox.checkState(), @@ -1120,7 +1120,7 @@ class KCCGUI(KCC_ui.Ui_mainWindow): GUI.jobList.clear() formats = ['.pdf'] if self.tar or self.sevenzip: - formats.extend(['.cb7', '.7z', '.cbz', '.zip', '.cbr', '.rar']) + formats.extend(['.cb7', '.7z', '.cbz', '.zip', '.cbr', '.rar', '.epub']) if os.path.isdir(message): GUI.jobList.addItem(message) GUI.jobList.scrollToBottom() diff --git a/kindlecomicconverter/KCC_ui.py b/kindlecomicconverter/KCC_ui.py index 62e9d29..c762dab 100644 --- a/kindlecomicconverter/KCC_ui.py +++ b/kindlecomicconverter/KCC_ui.py @@ -389,10 +389,10 @@ class Ui_mainWindow(object): self.gridLayout_2.addWidget(self.qualityBox, 1, 2, 1, 1) - self.pdfExtractBox = QCheckBox(self.optionWidget) - self.pdfExtractBox.setObjectName(u"pdfExtractBox") + self.legacyExtractBox = QCheckBox(self.optionWidget) + self.legacyExtractBox.setObjectName(u"legacyExtractBox") - self.gridLayout_2.addWidget(self.pdfExtractBox, 9, 0, 1, 1) + self.gridLayout_2.addWidget(self.legacyExtractBox, 9, 0, 1, 1) self.colorBox = QCheckBox(self.optionWidget) self.colorBox.setObjectName(u"colorBox") @@ -785,11 +785,9 @@ class Ui_mainWindow(object): #endif // QT_CONFIG(tooltip) self.qualityBox.setText(QCoreApplication.translate("mainWindow", u"Panel View 4/2/HQ", None)) #if QT_CONFIG(tooltip) - self.pdfExtractBox.setToolTip(QCoreApplication.translate("mainWindow", u"Use the PDF image extraction method from KCC 8 and earlier.\n" -"\n" -"Useful for really weird PDFs.", None)) + self.legacyExtractBox.setToolTip(QCoreApplication.translate("mainWindow", u"

Use the PDF/EPUB image extraction method from older KCC versions.


Use if standard extraction fails for whatever reason.

", None)) #endif // QT_CONFIG(tooltip) - self.pdfExtractBox.setText(QCoreApplication.translate("mainWindow", u"PDF Legacy Extract", None)) + self.legacyExtractBox.setText(QCoreApplication.translate("mainWindow", u"Legacy Extract", None)) #if QT_CONFIG(tooltip) self.colorBox.setToolTip(QCoreApplication.translate("mainWindow", u"

Disable conversion to grayscale.

", None)) #endif // QT_CONFIG(tooltip) diff --git a/kindlecomicconverter/comic2ebook.py b/kindlecomicconverter/comic2ebook.py index 6a92f1a..3b084d5 100755 --- a/kindlecomicconverter/comic2ebook.py +++ b/kindlecomicconverter/comic2ebook.py @@ -22,7 +22,9 @@ from collections import Counter import os import pathlib import re +import shutil import sys +import xml.etree.ElementTree as ET from argparse import ArgumentParser from time import perf_counter, strftime, gmtime from copy import copy @@ -917,7 +919,7 @@ def getWorkFolder(afile, workdir=None): os.makedirs(fullPath) path = workdir sanitizePermissions(path) - if options.pdfextract: + if options.legacyextract: pdf = pdfjpgextract.PdfJpgExtract(afile, fullPath) njpg = pdf.extract() if njpg == 0: @@ -956,11 +958,61 @@ def getWorkFolder(afile, workdir=None): for file in os.listdir(os.path.join(fullPath, tdir[0])): move(os.path.join(fullPath, tdir[0], file), fullPath) os.rmdir(os.path.join(fullPath, tdir[0])) + + if options.legacyextract: + return workdir + + if afile.lower().endswith('.epub'): + container = ET.parse(os.path.join(path, 'META-INF', 'container.xml')) + opf_path = container.find(r'.//{*}rootfile').attrib['full-path'] + opf_path = os.path.join(path, opf_path) + opf = ET.parse(opf_path) + spine = [] + for spine_item in opf.findall(r'.//{*}itemref'): + spine.append(spine_item.attrib.get('idref')) + manifest_dict = {} + for manifest_item in opf.findall(".//*[@media-type='application/xhtml+xml']"): + manifest_dict[manifest_item.attrib.get('id')] = manifest_item.attrib.get('href') + ordered_image_paths = [] + for i, spine_item in enumerate(spine): + if spine_item not in manifest_dict: + continue + page_path = os.path.join(os.path.dirname(opf_path), manifest_dict[spine_item]) + page = ET.parse(page_path) + imgs = page.findall(r'.//{*}img') + page.findall(r'.//{*}image') + img_path = None + # TODO handle more than first image + for img in imgs: + for key in img.attrib: + if 'src' in key or 'href' in key: + img_path = img.attrib[key] + if img_path.startswith('..'): + img_path = os.path.join(os.path.dirname(opf_path), os.path.dirname(manifest_dict[spine_item]), img_path) + else: + img_path = os.path.join(os.path.dirname(opf_path), os.path.dirname(manifest_dict[spine_item]), img_path) + break + # TODO empty image + if img_path: + ordered_image_paths.append(img_path) + # fallback if naive spine extraction fails + if not ordered_image_paths: + return workdir + + if options.tempdir: + workdir2 = mkdtemp('', 'KCC-', os.path.dirname(afile)) + else: + workdir2 = mkdtemp('', 'KCC-') + for i, img_path in enumerate(ordered_image_paths): + _, ext = os.path.splitext(img_path) + fullpath2 = os.path.join(workdir2, 'OEBPS', 'Images') + os.makedirs(fullpath2, exist_ok=True) + shutil.copyfile(img_path, os.path.join(fullpath2, f"{i}{ext}")) + rmtree(workdir, True) + return workdir2 + return workdir - - except OSError as e: - rmtree(workdir, True) - raise UserWarning(e) + finally: + pass else: raise UserWarning("Failed to open source file/directory.") @@ -1406,8 +1458,8 @@ def makeParser(): processing_options.add_argument("-n", "--noprocessing", action="store_true", dest="noprocessing", default=False, help="Do not modify image and ignore any profile or processing option") - processing_options.add_argument("--pdfextract", action="store_true", dest="pdfextract", default=False, - help="Use the legacy PDF image extraction method from KCC 8 and earlier") + processing_options.add_argument("--legacyextract", action="store_true", dest="legacyextract", default=False, + help="Use the legacy PDF/EPUB image extraction method from older KCC versions") processing_options.add_argument("--pdfwidth", action="store_true", dest="pdfwidth", default=False, help="Render vector PDFs to device width instead of height.") processing_options.add_argument("--smartcovercrop", action="store_true", dest="smartcovercrop", default=False,