Add legacy pdf image extract option (#1225)

2026-01-25 22:47:28 +00:00 · 2026-01-25 13:41:43 -08:00
parent f63387cae4
commit 9a4143ce62
6 changed files with 111 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -238,6 +238,7 @@ MAIN:

 PROCESSING:
  -n, --noprocessing    Do not modify image and ignore any profile or processing option
+  --pdfextract          Use the legacy PDF image extraction method from KCC 8 and earlier
  -u, --upscale         Resize images smaller than device's resolution
  -s, --stretch         Stretch images to device's resolution
  -r SPLITTER, --splitter SPLITTER
--- a/gui/KCC.ui
+++ b/gui/KCC.ui
@@ -896,6 +896,18 @@ Higher values are larger and higher quality, and may resolve blank page issues.<
         </property>
        </widget>
       </item>
+       <item row="9" column="0">
+        <widget class="QCheckBox" name="pdfExtractBox">
+         <property name="toolTip">
+          <string>Use the PDF image extraction method from KCC 8 and earlier.
+
+Useful for really weird PDFs.</string>
+         </property>
+         <property name="text">
+          <string>PDF Legacy Extract</string>
+         </property>
+        </widget>
+       </item>
      </layout>
     </widget>
    </item>
--- a/kindlecomicconverter/KCC_gui.py
+++ b/kindlecomicconverter/KCC_gui.py
@@ -327,6 +327,8 @@ class WorkerThread(QThread):
            options.maximizestrips = True
        if GUI.disableProcessingBox.isChecked():
            options.noprocessing = True
+        if GUI.pdfExtractBox.isChecked():
+            options.pdfextract = True
        if GUI.metadataTitleBox.checkState() == Qt.CheckState.PartiallyChecked:
            options.metadatatitle = 1
        elif GUI.metadataTitleBox.checkState() == Qt.CheckState.Checked:
@@ -1032,6 +1034,7 @@ class KCCGUI(KCC_ui.Ui_mainWindow):
                                           'colorBox': GUI.colorBox.checkState(),
                                           'eraseRainbowBox': GUI.eraseRainbowBox.checkState(),
                                           'disableProcessingBox': GUI.disableProcessingBox.checkState(),
+                                           'pdfExtractBox': GUI.pdfExtractBox.checkState(),
                                           'metadataTitleBox': GUI.metadataTitleBox.checkState(),
                                           'mozJpegBox': GUI.mozJpegBox.checkState(),
                                           'jpegQualityBox': GUI.jpegQualityBox.checkState(),
--- a/kindlecomicconverter/KCC_ui.py
+++ b/kindlecomicconverter/KCC_ui.py
@@ -462,6 +462,11 @@ class Ui_mainWindow(object):

        self.gridLayout_2.addWidget(self.jpegQualityBox, 8, 0, 1, 1)

+        self.pdfExtractBox = QCheckBox(self.optionWidget)
+        self.pdfExtractBox.setObjectName(u"pdfExtractBox")
+
+        self.gridLayout_2.addWidget(self.pdfExtractBox, 9, 0, 1, 1)
+

        self.gridLayout.addWidget(self.optionWidget, 5, 0, 1, 2)

@@ -733,6 +738,12 @@ class Ui_mainWindow(object):
 "Higher values are larger and higher quality, and may resolve blank page issues.", None))
 #endif // QT_CONFIG(tooltip)
        self.jpegQualityBox.setText(QCoreApplication.translate("mainWindow", u"Custom JPEG Quality", None))
+#if QT_CONFIG(tooltip)
+        self.pdfExtractBox.setToolTip(QCoreApplication.translate("mainWindow", u"Use the PDF image extraction method from KCC 8 and earlier.\n"
+"\n"
+"Useful for really weird PDFs.", None))
+#endif // QT_CONFIG(tooltip)
+        self.pdfExtractBox.setText(QCoreApplication.translate("mainWindow", u"PDF Legacy Extract", None))
        self.gammaLabel.setText(QCoreApplication.translate("mainWindow", u"Gamma: Auto", None))
        self.jpegQualityLabel.setText(QCoreApplication.translate("mainWindow", u"JPEG Quality:", None))
    # retranslateUi
--- a/kindlecomicconverter/comic2ebook.py
+++ b/kindlecomicconverter/comic2ebook.py
@@ -48,6 +48,7 @@ from .comicarchive import SEVENZIP, available_archive_tools
 from . import comic2panel
 from . import image
 from . import comicarchive
+from . import pdfjpgextract
 from . import dualmetafix
 from . import metadata
 from . import kindle
@@ -875,6 +876,12 @@ def getWorkFolder(afile, workdir=None):
                os.makedirs(fullPath)
            path = workdir
            sanitizePermissions(path)
+            if options.pdfextract:
+                pdf = pdfjpgextract.PdfJpgExtract(afile, fullPath)
+                njpg = pdf.extract()
+                if njpg == 0:
+                    raise UserWarning("Failed to extract images from PDF file.")
+                return workdir
            target_height = options.profileData[1][1]
            if options.cropping == 1:
                target_height = target_height + target_height*0.20 #Account for possible margin at the top and bottom
@@ -1342,6 +1349,8 @@ def makeParser():

    processing_options.add_argument("-n", "--noprocessing", action="store_true", dest="noprocessing", default=False,
                                    help="Do not modify image and ignore any profile or processing option")
+    processing_options.add_argument("--pdfextract", action="store_true", dest="pdfextract", default=False,
+                                    help="Use the legacy PDF image extraction method from KCC 8 and earlier")
    processing_options.add_argument("-u", "--upscale", action="store_true", dest="upscale", default=False,
                                    help="Resize images smaller than device's resolution")
    processing_options.add_argument("-s", "--stretch", action="store_true", dest="stretch", default=False,
--- a/kindlecomicconverter/pdfjpgextract.py
+++ b/kindlecomicconverter/pdfjpgextract.py
@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2012-2014 Ciro Mattia Gonano <ciromattia@gmail.com>
+# Copyright (c) 2013-2019 Pawel Jastrzebski <pawelj@iosphe.re>
+#
+# Based upon the code snippet by Ned Batchelder
+# (http://nedbatchelder.com/blog/200712/extracting_jpgs_from_pdfs.html)
+#
+# Permission to use, copy, modify, and/or distribute this software for
+# any purpose with or without fee is hereby granted, provided that the
+# above copyright notice and this permission notice appear in all
+# copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
+# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
+# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
+# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
+# OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+# PERFORMANCE OF THIS SOFTWARE.
+#
+
+import os
+
+# skip stray images a few pixels in size in some PDFs
+# typical images are many thousands in length
+# https://github.com/ciromattia/kcc/pull/546
+STRAY_IMAGE_LENGTH_THRESHOLD = 300
+
+
+class PdfJpgExtract:
+    """Extract embedded JPEG images from a PDF by scanning its raw bytes.
+
+    The file is not parsed as a PDF document: the byte string is searched
+    for ``stream`` keywords, and every stream whose data begins with the
+    JPEG start-of-image marker is copied out verbatim.
+    """
+
+    def __init__(self, fname, fullPath):
+        """Remember the source PDF and the output directory.
+
+        :param fname: path of the PDF file to read.
+        :param fullPath: directory the JPEG files are written to; it is
+            not created here, so it must already exist.
+        """
+        self.fname = fname
+        self.path = fullPath
+
+    def getPath(self):
+        """Return the directory the extracted images are written to."""
+        return self.path
+
+    def extract(self):
+        """Write each embedded JPEG to ``jpg<N>.jpg`` and return the count.
+
+        Images shorter than ``STRAY_IMAGE_LENGTH_THRESHOLD`` bytes are
+        skipped and do not consume an index.
+
+        :returns: number of image files written.
+        :raises Exception: if a JPEG stream has no ``endstream`` keyword or
+            no end-of-image marker after it starts.
+        """
+        # Context manager so the handle is released even if read() fails.
+        with open(self.fname, "rb") as pdffile:
+            pdf = pdffile.read()
+        startmark = b"\xff\xd8"
+        startfix = 0
+        endmark = b"\xff\xd9"
+        endfix = 2
+        i = 0
+        njpg = 0
+        while True:
+            istream = pdf.find(b"stream", i)
+            if istream < 0:
+                break
+            # Only accept a start marker right after the keyword; this also
+            # rejects the "stream" found inside every "endstream".
+            istart = pdf.find(startmark, istream, istream + 20)
+            if istart < 0:
+                i = istream + 20
+                continue
+            iend = pdf.find(b"endstream", istart)
+            if iend < 0:
+                raise Exception("Didn't find end of stream!")
+            # Look for the end marker in the last 20 bytes before
+            # "endstream", but never before the image start: on very short
+            # streams an earlier marker would move the cursor backwards and
+            # rescan the same stream forever, and a negative start would be
+            # interpreted by find() as an offset from the end of the buffer.
+            iend = pdf.find(endmark, max(istart, iend - 20))
+            if iend < 0:
+                raise Exception("Didn't find end of JPG!")
+            istart += startfix
+            iend += endfix
+            i = iend
+
+            if iend - istart < STRAY_IMAGE_LENGTH_THRESHOLD:
+                continue
+
+            with open(os.path.join(self.path, "jpg%d.jpg" % njpg), "wb") as jpgfile:
+                jpgfile.write(pdf[istart:iend])
+            njpg += 1
+
+        return njpg