experimental epub input (#1090)

* experimental epub input
* fix missing spine items
* only extract first image on page
* re-organize
* fallback if naive spine extraction fails
* apply legacy extract option for epub too
2026-06-30 10:05:25 +00:00 · 2026-05-15 13:24:35 -07:00
parent 8030884148
commit 9827f11944
5 changed files with 73 additions and 25 deletions
@@ -247,7 +247,7 @@ MAIN:

 PROCESSING:
  -n, --noprocessing    Do not modify image and ignore any profile or processing option
-  --pdfextract          Use legacy PDF image extraction method from KCC 8 and earlier.
+  --legacyextract       Use legacy PDF/EPUB image extraction method from earlier KCC versions.
  --pdfwidth            Render vector PDFs based on device width instead of height.
  -u, --upscale         Resize images smaller than device's resolution
  -s, --stretch         Stretch images to device's resolution
@@ -750,14 +750,12 @@ Higher values are larger and higher quality, and may resolve blank page issues.<
        </widget>
       </item>
       <item row="9" column="0">
-        <widget class="QCheckBox" name="pdfExtractBox">
+        <widget class="QCheckBox" name="legacyExtractBox">
         <property name="toolTip">
-          <string>Use the PDF image extraction method from KCC 8 and earlier.
-
-Useful for really weird PDFs.</string>
+          <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;Use the PDF/EPUB image extraction method from older KCC versions.&lt;/p&gt;&lt;p&gt;&lt;br/&gt;&lt;/p&gt;&lt;p&gt;Use if standard extraction fails for whatever reason.&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
         </property>
         <property name="text">
-          <string>PDF Legacy Extract</string>
+          <string>Legacy Extract</string>
         </property>
        </widget>
       </item>
@@ -326,8 +326,8 @@ class WorkerThread(QThread):
            options.maximizestrips = True
        if GUI.disableProcessingBox.isChecked():
            options.noprocessing = True
-        if GUI.pdfExtractBox.isChecked():
-            options.pdfextract = True
+        if GUI.legacyExtractBox.isChecked():
+            options.legacyextract = True
        if GUI.pdfWidthBox.isChecked():
            options.pdfwidth = True
        if GUI.smartCoverCropBox.isChecked():
@@ -625,7 +625,7 @@ class KCCGUI(KCC_ui.Ui_mainWindow):
            GUI.jobList.clear()
        if self.tar or self.sevenzip:
            fnames = QFileDialog.getOpenFileNames(MW, 'Select file', self.lastPath,
-                                                            'Comic (*.cbz *.cbr *.cb7 *.zip *.rar *.7z *.pdf);;All (*.*)')
+                                                            'Comic (*.cbz *.cbr *.cb7 *.zip *.rar *.7z *.epub *.pdf);;All (*.*)')
        else:
            fnames = QFileDialog.getOpenFileNames(MW, 'Select file', self.lastPath,
                                                            'Comic (*.pdf);;All (*.*)')
@@ -1080,7 +1080,7 @@ class KCCGUI(KCC_ui.Ui_mainWindow):
                                           'colorBox': GUI.colorBox.checkState(),
                                           'eraseRainbowBox': GUI.eraseRainbowBox.checkState(),
                                           'disableProcessingBox': GUI.disableProcessingBox.checkState(),
-                                           'pdfExtractBox': GUI.pdfExtractBox.checkState(),
+                                           'legacyExtractBox': GUI.legacyExtractBox.checkState(),
                                           'pdfWidthBox': GUI.pdfWidthBox.checkState(),
                                           'smartCoverCropBox': GUI.smartCoverCropBox.checkState(),
                                           'coverFillBox': GUI.coverFillBox.checkState(),
@@ -1120,7 +1120,7 @@ class KCCGUI(KCC_ui.Ui_mainWindow):
                GUI.jobList.clear()
            formats = ['.pdf']
            if self.tar or self.sevenzip:
-                formats.extend(['.cb7', '.7z', '.cbz', '.zip', '.cbr', '.rar'])
+                formats.extend(['.cb7', '.7z', '.cbz', '.zip', '.cbr', '.rar', '.epub'])
            if os.path.isdir(message):
                GUI.jobList.addItem(message)
                GUI.jobList.scrollToBottom()
@@ -389,10 +389,10 @@ class Ui_mainWindow(object):

        self.gridLayout_2.addWidget(self.qualityBox, 1, 2, 1, 1)

-        self.pdfExtractBox = QCheckBox(self.optionWidget)
-        self.pdfExtractBox.setObjectName(u"pdfExtractBox")
+        self.legacyExtractBox = QCheckBox(self.optionWidget)
+        self.legacyExtractBox.setObjectName(u"legacyExtractBox")

-        self.gridLayout_2.addWidget(self.pdfExtractBox, 9, 0, 1, 1)
+        self.gridLayout_2.addWidget(self.legacyExtractBox, 9, 0, 1, 1)

        self.colorBox = QCheckBox(self.optionWidget)
        self.colorBox.setObjectName(u"colorBox")
@@ -785,11 +785,9 @@ class Ui_mainWindow(object):
 #endif // QT_CONFIG(tooltip)
        self.qualityBox.setText(QCoreApplication.translate("mainWindow", u"Panel View 4/2/HQ", None))
 #if QT_CONFIG(tooltip)
-        self.pdfExtractBox.setToolTip(QCoreApplication.translate("mainWindow", u"Use the PDF image extraction method from KCC 8 and earlier.\n"
-"\n"
-"Useful for really weird PDFs.", None))
+        self.legacyExtractBox.setToolTip(QCoreApplication.translate("mainWindow", u"<html><head/><body><p>Use the PDF/EPUB image extraction method from older KCC versions.</p><p><br/></p><p>Use if standard extraction fails for whatever reason.</p></body></html>", None))
 #endif // QT_CONFIG(tooltip)
-        self.pdfExtractBox.setText(QCoreApplication.translate("mainWindow", u"PDF Legacy Extract", None))
+        self.legacyExtractBox.setText(QCoreApplication.translate("mainWindow", u"Legacy Extract", None))
 #if QT_CONFIG(tooltip)
        self.colorBox.setToolTip(QCoreApplication.translate("mainWindow", u"<html><head/><body><p style='white-space:pre'>Disable conversion to grayscale.</p></body></html>", None))
 #endif // QT_CONFIG(tooltip)
@@ -22,7 +22,9 @@ from collections import Counter
 import os
 import pathlib
 import re
+import shutil
 import sys
+import xml.etree.ElementTree as ET
 from argparse import ArgumentParser
 from time import perf_counter, strftime, gmtime
 from copy import copy
@@ -917,7 +919,7 @@ def getWorkFolder(afile, workdir=None):
                os.makedirs(fullPath)
            path = workdir
            sanitizePermissions(path)
-            if options.pdfextract:
+            if options.legacyextract:
                pdf = pdfjpgextract.PdfJpgExtract(afile, fullPath)
                njpg = pdf.extract()
                if njpg == 0:
@@ -956,11 +958,61 @@ def getWorkFolder(afile, workdir=None):
                    for file in os.listdir(os.path.join(fullPath, tdir[0])):
                        move(os.path.join(fullPath, tdir[0], file), fullPath)
                    os.rmdir(os.path.join(fullPath, tdir[0]))
+
+                if options.legacyextract:
+                    return workdir
+
+                if afile.lower().endswith('.epub'):
+                    container = ET.parse(os.path.join(path, 'META-INF', 'container.xml'))
+                    opf_path = container.find(r'.//{*}rootfile').attrib['full-path']
+                    opf_path = os.path.join(path, opf_path)
+                    opf = ET.parse(opf_path)
+                    spine = []
+                    for spine_item in opf.findall(r'.//{*}itemref'):
+                        spine.append(spine_item.attrib.get('idref'))
+                    manifest_dict = {}
+                    for manifest_item in opf.findall(".//*[@media-type='application/xhtml+xml']"):
+                        manifest_dict[manifest_item.attrib.get('id')] = manifest_item.attrib.get('href')
+                    ordered_image_paths = []
+                    for i, spine_item in enumerate(spine):
+                        if spine_item not in manifest_dict:
+                            continue
+                        page_path = os.path.join(os.path.dirname(opf_path), manifest_dict[spine_item])
+                        page = ET.parse(page_path)
+                        imgs = page.findall(r'.//{*}img') + page.findall(r'.//{*}image')
+                        img_path = None
+                        # TODO handle more than first image
+                        for img in imgs:
+                            for key in img.attrib:
+                                if 'src' in key or 'href' in key:
+                                    img_path = img.attrib[key]
+                                    if img_path.startswith('..'):
+                                        img_path = os.path.join(os.path.dirname(opf_path), os.path.dirname(manifest_dict[spine_item]), img_path)
+                                    else:
+                                        img_path = os.path.join(os.path.dirname(opf_path), os.path.dirname(manifest_dict[spine_item]), img_path)
+                            break
+                        # TODO empty image
+                        if img_path:
+                            ordered_image_paths.append(img_path)
+                    # fallback if naive spine extraction fails
+                    if not ordered_image_paths:
+                        return workdir
+
+                    if options.tempdir:
+                        workdir2 = mkdtemp('', 'KCC-', os.path.dirname(afile))
+                    else:
+                        workdir2 = mkdtemp('', 'KCC-')
+                    for i, img_path in enumerate(ordered_image_paths):
+                        _, ext = os.path.splitext(img_path)
+                        fullpath2 = os.path.join(workdir2, 'OEBPS', 'Images')
+                        os.makedirs(fullpath2, exist_ok=True)
+                        shutil.copyfile(img_path, os.path.join(fullpath2, f"{i}{ext}"))
+                    rmtree(workdir, True)
+                    return workdir2
+                
                return workdir
- 
-            except OSError as e:
-                rmtree(workdir, True)
-                raise UserWarning(e)
+            finally:
+                pass
    else:
        raise UserWarning("Failed to open source file/directory.")

@@ -1406,8 +1458,8 @@ def makeParser():

    processing_options.add_argument("-n", "--noprocessing", action="store_true", dest="noprocessing", default=False,
                                    help="Do not modify image and ignore any profile or processing option")
-    processing_options.add_argument("--pdfextract", action="store_true", dest="pdfextract", default=False,
-                                    help="Use the legacy PDF image extraction method from KCC 8 and earlier")
+    processing_options.add_argument("--legacyextract", action="store_true", dest="legacyextract", default=False,
+                                    help="Use the legacy PDF/EPUB image extraction method from older KCC versions")
    processing_options.add_argument("--pdfwidth", action="store_true", dest="pdfwidth", default=False,
                                    help="Render vector PDFs to device width instead of height.")
    processing_options.add_argument("--smartcovercrop", action="store_true", dest="smartcovercrop", default=False,