From 9827f11944dc9e317ed9aad5d7ad64cea97465bb Mon Sep 17 00:00:00 2001
From: Alex Xu <alexkurosakimh3@gmail.com>
Date: Fri, 15 May 2026 13:24:35 -0700
Subject: [PATCH] experimental epub input (#1090)

* experimental epub input

* fix missing spine items

* only extract first image on page

* re-organize

* fallback if naive spine extraction fails

* apply legacy extract option for epub too
---
 README.md                           |  2 +-
 gui/KCC.ui                          |  8 ++--
 kindlecomicconverter/KCC_gui.py     | 10 ++---
 kindlecomicconverter/KCC_ui.py      | 12 +++---
 kindlecomicconverter/comic2ebook.py | 66 ++++++++++++++++++++++++++---
 5 files changed, 73 insertions(+), 25 deletions(-)
diff --git a/README.md b/README.md
index ca7b327..4a89344 100644
--- a/README.md
+++ b/README.md
@@ -247,7 +247,7 @@ MAIN:
 
 PROCESSING:
   -n, --noprocessing    Do not modify image and ignore any profile or processing option
-  --pdfextract          Use legacy PDF image extraction method from KCC 8 and earlier.
+  --legacyextract       Use the legacy PDF/EPUB image extraction method from earlier KCC versions.
   --pdfwidth            Render vector PDFs based on device width instead of height.
   -u, --upscale         Resize images smaller than device's resolution
   -s, --stretch         Stretch images to device's resolution
diff --git a/gui/KCC.ui b/gui/KCC.ui
index 9cc41d3..fffbe9b 100644
--- a/gui/KCC.ui
+++ b/gui/KCC.ui
@@ -750,14 +750,12 @@ Higher values are larger and higher quality, and may resolve blank page issues.<
         </widget>
        </item>
        <item row="9" column="0">
-        <widget class="QCheckBox" name="pdfExtractBox">
+        <widget class="QCheckBox" name="legacyExtractBox">
          <property name="toolTip">
-          <string>Use the PDF image extraction method from KCC 8 and earlier.
-
-Useful for really weird PDFs.</string>
+          <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;Use the PDF/EPUB image extraction method from older KCC versions.&lt;/p&gt;&lt;p&gt;&lt;br/&gt;&lt;/p&gt;&lt;p&gt;Use if standard extraction fails for whatever reason.&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
          </property>
          <property name="text">
-          <string>PDF Legacy Extract</string>
+          <string>Legacy Extract</string>
          </property>
         </widget>
        </item>
diff --git a/kindlecomicconverter/KCC_gui.py b/kindlecomicconverter/KCC_gui.py
index c25c290..802b73d 100644
--- a/kindlecomicconverter/KCC_gui.py
+++ b/kindlecomicconverter/KCC_gui.py
@@ -326,8 +326,8 @@ class WorkerThread(QThread):
             options.maximizestrips = True
         if GUI.disableProcessingBox.isChecked():
             options.noprocessing = True
-        if GUI.pdfExtractBox.isChecked():
-            options.pdfextract = True
+        if GUI.legacyExtractBox.isChecked():
+            options.legacyextract = True
         if GUI.pdfWidthBox.isChecked():
             options.pdfwidth = True
         if GUI.smartCoverCropBox.isChecked():
@@ -625,7 +625,7 @@ class KCCGUI(KCC_ui.Ui_mainWindow):
             GUI.jobList.clear()
         if self.tar or self.sevenzip:
             fnames = QFileDialog.getOpenFileNames(MW, 'Select file', self.lastPath,
-                                                            'Comic (*.cbz *.cbr *.cb7 *.zip *.rar *.7z *.pdf);;All (*.*)')
+                                                            'Comic (*.cbz *.cbr *.cb7 *.zip *.rar *.7z *.epub *.pdf);;All (*.*)')
         else:
             fnames = QFileDialog.getOpenFileNames(MW, 'Select file', self.lastPath,
                                                             'Comic (*.pdf);;All (*.*)')
@@ -1080,7 +1080,7 @@ class KCCGUI(KCC_ui.Ui_mainWindow):
                                            'colorBox': GUI.colorBox.checkState(),
                                            'eraseRainbowBox': GUI.eraseRainbowBox.checkState(),
                                            'disableProcessingBox': GUI.disableProcessingBox.checkState(),
-                                           'pdfExtractBox': GUI.pdfExtractBox.checkState(),
+                                           'legacyExtractBox': GUI.legacyExtractBox.checkState(),
                                            'pdfWidthBox': GUI.pdfWidthBox.checkState(),
                                            'smartCoverCropBox': GUI.smartCoverCropBox.checkState(),
                                            'coverFillBox': GUI.coverFillBox.checkState(),
@@ -1120,7 +1120,7 @@ class KCCGUI(KCC_ui.Ui_mainWindow):
                 GUI.jobList.clear()
             formats = ['.pdf']
             if self.tar or self.sevenzip:
-                formats.extend(['.cb7', '.7z', '.cbz', '.zip', '.cbr', '.rar'])
+                formats.extend(['.cb7', '.7z', '.cbz', '.zip', '.cbr', '.rar', '.epub'])
             if os.path.isdir(message):
                 GUI.jobList.addItem(message)
                 GUI.jobList.scrollToBottom()
diff --git a/kindlecomicconverter/KCC_ui.py b/kindlecomicconverter/KCC_ui.py
index 62e9d29..c762dab 100644
--- a/kindlecomicconverter/KCC_ui.py
+++ b/kindlecomicconverter/KCC_ui.py
@@ -389,10 +389,10 @@ class Ui_mainWindow(object):
 
         self.gridLayout_2.addWidget(self.qualityBox, 1, 2, 1, 1)
 
-        self.pdfExtractBox = QCheckBox(self.optionWidget)
-        self.pdfExtractBox.setObjectName(u"pdfExtractBox")
+        self.legacyExtractBox = QCheckBox(self.optionWidget)
+        self.legacyExtractBox.setObjectName(u"legacyExtractBox")
 
-        self.gridLayout_2.addWidget(self.pdfExtractBox, 9, 0, 1, 1)
+        self.gridLayout_2.addWidget(self.legacyExtractBox, 9, 0, 1, 1)
 
         self.colorBox = QCheckBox(self.optionWidget)
         self.colorBox.setObjectName(u"colorBox")
@@ -785,11 +785,9 @@ class Ui_mainWindow(object):
 #endif // QT_CONFIG(tooltip)
         self.qualityBox.setText(QCoreApplication.translate("mainWindow", u"Panel View 4/2/HQ", None))
 #if QT_CONFIG(tooltip)
-        self.pdfExtractBox.setToolTip(QCoreApplication.translate("mainWindow", u"Use the PDF image extraction method from KCC 8 and earlier.\n"
-"\n"
-"Useful for really weird PDFs.", None))
+        self.legacyExtractBox.setToolTip(QCoreApplication.translate("mainWindow", u"<html><head/><body><p>Use the PDF/EPUB image extraction method from older KCC versions.</p><p><br/></p><p>Use if standard extraction fails for whatever reason.</p></body></html>", None))
 #endif // QT_CONFIG(tooltip)
-        self.pdfExtractBox.setText(QCoreApplication.translate("mainWindow", u"PDF Legacy Extract", None))
+        self.legacyExtractBox.setText(QCoreApplication.translate("mainWindow", u"Legacy Extract", None))
 #if QT_CONFIG(tooltip)
         self.colorBox.setToolTip(QCoreApplication.translate("mainWindow", u"<html><head/><body><p style='white-space:pre'>Disable conversion to grayscale.</p></body></html>", None))
 #endif // QT_CONFIG(tooltip)
diff --git a/kindlecomicconverter/comic2ebook.py b/kindlecomicconverter/comic2ebook.py
index 6a92f1a..3b084d5 100755
--- a/kindlecomicconverter/comic2ebook.py
+++ b/kindlecomicconverter/comic2ebook.py
@@ -22,7 +22,9 @@ from collections import Counter
 import os
 import pathlib
 import re
+import shutil
 import sys
+import xml.etree.ElementTree as ET
 from argparse import ArgumentParser
 from time import perf_counter, strftime, gmtime
 from copy import copy
@@ -917,7 +919,7 @@ def getWorkFolder(afile, workdir=None):
                 os.makedirs(fullPath)
             path = workdir
             sanitizePermissions(path)
-            if options.pdfextract:
+            if options.legacyextract:
                 pdf = pdfjpgextract.PdfJpgExtract(afile, fullPath)
                 njpg = pdf.extract()
                 if njpg == 0:
@@ -956,11 +958,61 @@ def getWorkFolder(afile, workdir=None):
                     for file in os.listdir(os.path.join(fullPath, tdir[0])):
                         move(os.path.join(fullPath, tdir[0], file), fullPath)
                     os.rmdir(os.path.join(fullPath, tdir[0]))
+
+                if options.legacyextract:
+                    return workdir
+
+                if afile.lower().endswith('.epub'):
+                    container = ET.parse(os.path.join(path, 'META-INF', 'container.xml'))
+                    opf_path = container.find(r'.//{*}rootfile').attrib['full-path']
+                    opf_path = os.path.join(path, opf_path)
+                    opf = ET.parse(opf_path)
+                    spine = []
+                    for spine_item in opf.findall(r'.//{*}itemref'):
+                        spine.append(spine_item.attrib.get('idref'))
+                    manifest_dict = {}
+                    for manifest_item in opf.findall(".//*[@media-type='application/xhtml+xml']"):
+                        manifest_dict[manifest_item.attrib.get('id')] = manifest_item.attrib.get('href')
+                    ordered_image_paths = []
+                    for i, spine_item in enumerate(spine):
+                        if spine_item not in manifest_dict:
+                            continue
+                        page_path = os.path.join(os.path.dirname(opf_path), manifest_dict[spine_item])
+                        page = ET.parse(page_path)
+                        imgs = page.findall(r'.//{*}img') + page.findall(r'.//{*}image')
+                        img_path = None
+                        # TODO handle more than first image
+                        for img in imgs:
+                            for key in img.attrib:
+                                if 'src' in key or 'href' in key:
+                                    img_path = img.attrib[key]
+                                    if img_path.startswith('..'):
+                                        img_path = os.path.join(os.path.dirname(opf_path), os.path.dirname(manifest_dict[spine_item]), img_path)
+                                    else:
+                                        img_path = os.path.join(os.path.dirname(opf_path), os.path.dirname(manifest_dict[spine_item]), img_path)
+                            break
+                        # TODO: spine pages with no <img>/<image> source are currently skipped; decide how to handle them
+                        if img_path:
+                            ordered_image_paths.append(img_path)
+                    # fallback if naive spine extraction fails
+                    if not ordered_image_paths:
+                        return workdir
+
+                    if options.tempdir:
+                        workdir2 = mkdtemp('', 'KCC-', os.path.dirname(afile))
+                    else:
+                        workdir2 = mkdtemp('', 'KCC-')
+                    for i, img_path in enumerate(ordered_image_paths):
+                        _, ext = os.path.splitext(img_path)
+                        fullpath2 = os.path.join(workdir2, 'OEBPS', 'Images')
+                        os.makedirs(fullpath2, exist_ok=True)
+                        shutil.copyfile(img_path, os.path.join(fullpath2, f"{i}{ext}"))
+                    rmtree(workdir, True)
+                    return workdir2
+                
                 return workdir
- 
-            except OSError as e:
-                rmtree(workdir, True)
-                raise UserWarning(e)
+            finally:
+                pass
     else:
         raise UserWarning("Failed to open source file/directory.")
 
@@ -1406,8 +1458,8 @@ def makeParser():
 
     processing_options.add_argument("-n", "--noprocessing", action="store_true", dest="noprocessing", default=False,
                                     help="Do not modify image and ignore any profile or processing option")
-    processing_options.add_argument("--pdfextract", action="store_true", dest="pdfextract", default=False,
-                                    help="Use the legacy PDF image extraction method from KCC 8 and earlier")
+    processing_options.add_argument("--legacyextract", action="store_true", dest="legacyextract", default=False,
+                                    help="Use the legacy PDF/EPUB image extraction method from older KCC versions")
     processing_options.add_argument("--pdfwidth", action="store_true", dest="pdfwidth", default=False,
                                     help="Render vector PDFs to device width instead of height.")
     processing_options.add_argument("--smartcovercrop", action="store_true", dest="smartcovercrop", default=False,