1
0
mirror of https://github.com/ciromattia/kcc synced 2026-05-16 04:21:42 +00:00

experimental epub input (#1090)

* experimental epub input

* fix missing spine items

* only extract first image on page

* re-organize

* fallback if naive spine extraction fails

* apply legacy extract option for epub too
This commit is contained in:
Alex Xu
2026-05-15 13:24:35 -07:00
committed by GitHub
parent 8030884148
commit 9827f11944
5 changed files with 73 additions and 25 deletions

View File

@@ -247,7 +247,7 @@ MAIN:
PROCESSING:
-n, --noprocessing Do not modify image and ignore any profile or processing option
--pdfextract Use legacy PDF image extraction method from KCC 8 and earlier.
--legacyextract Use legacy PDF/EPUB image extraction method from earlier KCC versions.
--pdfwidth Render vector PDFs based on device width instead of height.
-u, --upscale Resize images smaller than device's resolution
-s, --stretch Stretch images to device's resolution

View File

@@ -750,14 +750,12 @@ Higher values are larger and higher quality, and may resolve blank page issues.<
</widget>
</item>
<item row="9" column="0">
<widget class="QCheckBox" name="pdfExtractBox">
<widget class="QCheckBox" name="legacyExtractBox">
<property name="toolTip">
<string>Use the PDF image extraction method from KCC 8 and earlier.
Useful for really weird PDFs.</string>
<string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;Use the PDF/EPUB image extraction method from older KCC versions.&lt;/p&gt;&lt;p&gt;&lt;br/&gt;&lt;/p&gt;&lt;p&gt;Use if standard extraction fails for whatever reason.&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
</property>
<property name="text">
<string>PDF Legacy Extract</string>
<string>Legacy Extract</string>
</property>
</widget>
</item>

View File

@@ -326,8 +326,8 @@ class WorkerThread(QThread):
options.maximizestrips = True
if GUI.disableProcessingBox.isChecked():
options.noprocessing = True
if GUI.pdfExtractBox.isChecked():
options.pdfextract = True
if GUI.legacyExtractBox.isChecked():
options.legacyextract = True
if GUI.pdfWidthBox.isChecked():
options.pdfwidth = True
if GUI.smartCoverCropBox.isChecked():
@@ -625,7 +625,7 @@ class KCCGUI(KCC_ui.Ui_mainWindow):
GUI.jobList.clear()
if self.tar or self.sevenzip:
fnames = QFileDialog.getOpenFileNames(MW, 'Select file', self.lastPath,
'Comic (*.cbz *.cbr *.cb7 *.zip *.rar *.7z *.pdf);;All (*.*)')
'Comic (*.cbz *.cbr *.cb7 *.zip *.rar *.7z *.epub *.pdf);;All (*.*)')
else:
fnames = QFileDialog.getOpenFileNames(MW, 'Select file', self.lastPath,
'Comic (*.pdf);;All (*.*)')
@@ -1080,7 +1080,7 @@ class KCCGUI(KCC_ui.Ui_mainWindow):
'colorBox': GUI.colorBox.checkState(),
'eraseRainbowBox': GUI.eraseRainbowBox.checkState(),
'disableProcessingBox': GUI.disableProcessingBox.checkState(),
'pdfExtractBox': GUI.pdfExtractBox.checkState(),
'legacyExtractBox': GUI.legacyExtractBox.checkState(),
'pdfWidthBox': GUI.pdfWidthBox.checkState(),
'smartCoverCropBox': GUI.smartCoverCropBox.checkState(),
'coverFillBox': GUI.coverFillBox.checkState(),
@@ -1120,7 +1120,7 @@ class KCCGUI(KCC_ui.Ui_mainWindow):
GUI.jobList.clear()
formats = ['.pdf']
if self.tar or self.sevenzip:
formats.extend(['.cb7', '.7z', '.cbz', '.zip', '.cbr', '.rar'])
formats.extend(['.cb7', '.7z', '.cbz', '.zip', '.cbr', '.rar', '.epub'])
if os.path.isdir(message):
GUI.jobList.addItem(message)
GUI.jobList.scrollToBottom()

View File

@@ -389,10 +389,10 @@ class Ui_mainWindow(object):
self.gridLayout_2.addWidget(self.qualityBox, 1, 2, 1, 1)
self.pdfExtractBox = QCheckBox(self.optionWidget)
self.pdfExtractBox.setObjectName(u"pdfExtractBox")
self.legacyExtractBox = QCheckBox(self.optionWidget)
self.legacyExtractBox.setObjectName(u"legacyExtractBox")
self.gridLayout_2.addWidget(self.pdfExtractBox, 9, 0, 1, 1)
self.gridLayout_2.addWidget(self.legacyExtractBox, 9, 0, 1, 1)
self.colorBox = QCheckBox(self.optionWidget)
self.colorBox.setObjectName(u"colorBox")
@@ -785,11 +785,9 @@ class Ui_mainWindow(object):
#endif // QT_CONFIG(tooltip)
self.qualityBox.setText(QCoreApplication.translate("mainWindow", u"Panel View 4/2/HQ", None))
#if QT_CONFIG(tooltip)
self.pdfExtractBox.setToolTip(QCoreApplication.translate("mainWindow", u"Use the PDF image extraction method from KCC 8 and earlier.\n"
"\n"
"Useful for really weird PDFs.", None))
self.legacyExtractBox.setToolTip(QCoreApplication.translate("mainWindow", u"<html><head/><body><p>Use the PDF/EPUB image extraction method from older KCC versions.</p><p><br/></p><p>Use if standard extraction fails for whatever reason.</p></body></html>", None))
#endif // QT_CONFIG(tooltip)
self.pdfExtractBox.setText(QCoreApplication.translate("mainWindow", u"PDF Legacy Extract", None))
self.legacyExtractBox.setText(QCoreApplication.translate("mainWindow", u"Legacy Extract", None))
#if QT_CONFIG(tooltip)
self.colorBox.setToolTip(QCoreApplication.translate("mainWindow", u"<html><head/><body><p style='white-space:pre'>Disable conversion to grayscale.</p></body></html>", None))
#endif // QT_CONFIG(tooltip)

View File

@@ -22,7 +22,9 @@ from collections import Counter
import os
import pathlib
import re
import shutil
import sys
import xml.etree.ElementTree as ET
from argparse import ArgumentParser
from time import perf_counter, strftime, gmtime
from copy import copy
@@ -917,7 +919,7 @@ def getWorkFolder(afile, workdir=None):
os.makedirs(fullPath)
path = workdir
sanitizePermissions(path)
if options.pdfextract:
if options.legacyextract:
pdf = pdfjpgextract.PdfJpgExtract(afile, fullPath)
njpg = pdf.extract()
if njpg == 0:
@@ -956,11 +958,61 @@ def getWorkFolder(afile, workdir=None):
for file in os.listdir(os.path.join(fullPath, tdir[0])):
move(os.path.join(fullPath, tdir[0], file), fullPath)
os.rmdir(os.path.join(fullPath, tdir[0]))
if options.legacyextract:
return workdir
if afile.lower().endswith('.epub'):
container = ET.parse(os.path.join(path, 'META-INF', 'container.xml'))
opf_path = container.find(r'.//{*}rootfile').attrib['full-path']
opf_path = os.path.join(path, opf_path)
opf = ET.parse(opf_path)
spine = []
for spine_item in opf.findall(r'.//{*}itemref'):
spine.append(spine_item.attrib.get('idref'))
manifest_dict = {}
for manifest_item in opf.findall(".//*[@media-type='application/xhtml+xml']"):
manifest_dict[manifest_item.attrib.get('id')] = manifest_item.attrib.get('href')
ordered_image_paths = []
for i, spine_item in enumerate(spine):
if spine_item not in manifest_dict:
continue
page_path = os.path.join(os.path.dirname(opf_path), manifest_dict[spine_item])
page = ET.parse(page_path)
imgs = page.findall(r'.//{*}img') + page.findall(r'.//{*}image')
img_path = None
# TODO: handle pages with more than one image; currently only the first image found is used
for img in imgs:
for key in img.attrib:
if 'src' in key or 'href' in key:
img_path = img.attrib[key]
if img_path.startswith('..'):
img_path = os.path.join(os.path.dirname(opf_path), os.path.dirname(manifest_dict[spine_item]), img_path)
else:
img_path = os.path.join(os.path.dirname(opf_path), os.path.dirname(manifest_dict[spine_item]), img_path)
break
# TODO: handle spine pages with no image reference; they are currently skipped
if img_path:
ordered_image_paths.append(img_path)
# Fallback: if spine-based extraction found no images, return the extracted archive folder as-is
if not ordered_image_paths:
return workdir
if options.tempdir:
workdir2 = mkdtemp('', 'KCC-', os.path.dirname(afile))
else:
workdir2 = mkdtemp('', 'KCC-')
for i, img_path in enumerate(ordered_image_paths):
_, ext = os.path.splitext(img_path)
fullpath2 = os.path.join(workdir2, 'OEBPS', 'Images')
os.makedirs(fullpath2, exist_ok=True)
shutil.copyfile(img_path, os.path.join(fullpath2, f"{i}{ext}"))
rmtree(workdir, True)
return workdir2
return workdir
except OSError as e:
rmtree(workdir, True)
raise UserWarning(e)
finally:
pass
else:
raise UserWarning("Failed to open source file/directory.")
@@ -1406,8 +1458,8 @@ def makeParser():
processing_options.add_argument("-n", "--noprocessing", action="store_true", dest="noprocessing", default=False,
help="Do not modify image and ignore any profile or processing option")
processing_options.add_argument("--pdfextract", action="store_true", dest="pdfextract", default=False,
help="Use the legacy PDF image extraction method from KCC 8 and earlier")
processing_options.add_argument("--legacyextract", action="store_true", dest="legacyextract", default=False,
help="Use the legacy PDF/EPUB image extraction method from older KCC versions")
processing_options.add_argument("--pdfwidth", action="store_true", dest="pdfwidth", default=False,
help="Render vector PDFs to device width instead of height.")
processing_options.add_argument("--smartcovercrop", action="store_true", dest="smartcovercrop", default=False,