diff --git a/README.md b/README.md
index a58bff5..575ffe9 100644
--- a/README.md
+++ b/README.md
@@ -238,6 +238,7 @@ MAIN:
PROCESSING:
-n, --noprocessing Do not modify image and ignore any profile or processing option
+ --pdfextract Use the legacy PDF image extraction method from KCC 8 and earlier
-u, --upscale Resize images smaller than device's resolution
-s, --stretch Stretch images to device's resolution
-r SPLITTER, --splitter SPLITTER
diff --git a/gui/KCC.ui b/gui/KCC.ui
index e2be86d..63dd857 100644
--- a/gui/KCC.ui
+++ b/gui/KCC.ui
@@ -896,6 +896,18 @@ Higher values are larger and higher quality, and may resolve blank page issues.<
+ -
+
+
+ Use the PDF image extraction method from KCC 8 and earlier.
+
+Useful for really weird PDFs.
+
+
+ PDF Legacy Extract
+
+
+
diff --git a/kindlecomicconverter/KCC_gui.py b/kindlecomicconverter/KCC_gui.py
index 464d8fb..46650fa 100644
--- a/kindlecomicconverter/KCC_gui.py
+++ b/kindlecomicconverter/KCC_gui.py
@@ -327,6 +327,8 @@ class WorkerThread(QThread):
options.maximizestrips = True
if GUI.disableProcessingBox.isChecked():
options.noprocessing = True
+ if GUI.pdfExtractBox.isChecked():
+ options.pdfextract = True
if GUI.metadataTitleBox.checkState() == Qt.CheckState.PartiallyChecked:
options.metadatatitle = 1
elif GUI.metadataTitleBox.checkState() == Qt.CheckState.Checked:
@@ -1032,6 +1034,7 @@ class KCCGUI(KCC_ui.Ui_mainWindow):
'colorBox': GUI.colorBox.checkState(),
'eraseRainbowBox': GUI.eraseRainbowBox.checkState(),
'disableProcessingBox': GUI.disableProcessingBox.checkState(),
+ 'pdfExtractBox': GUI.pdfExtractBox.checkState(),
'metadataTitleBox': GUI.metadataTitleBox.checkState(),
'mozJpegBox': GUI.mozJpegBox.checkState(),
'jpegQualityBox': GUI.jpegQualityBox.checkState(),
diff --git a/kindlecomicconverter/KCC_ui.py b/kindlecomicconverter/KCC_ui.py
index a9c7ecc..3574d15 100644
--- a/kindlecomicconverter/KCC_ui.py
+++ b/kindlecomicconverter/KCC_ui.py
@@ -462,6 +462,11 @@ class Ui_mainWindow(object):
self.gridLayout_2.addWidget(self.jpegQualityBox, 8, 0, 1, 1)
+ self.pdfExtractBox = QCheckBox(self.optionWidget)
+ self.pdfExtractBox.setObjectName(u"pdfExtractBox")
+
+ self.gridLayout_2.addWidget(self.pdfExtractBox, 9, 0, 1, 1)
+
self.gridLayout.addWidget(self.optionWidget, 5, 0, 1, 2)
@@ -733,6 +738,12 @@ class Ui_mainWindow(object):
"Higher values are larger and higher quality, and may resolve blank page issues.", None))
#endif // QT_CONFIG(tooltip)
self.jpegQualityBox.setText(QCoreApplication.translate("mainWindow", u"Custom JPEG Quality", None))
+#if QT_CONFIG(tooltip)
+ self.pdfExtractBox.setToolTip(QCoreApplication.translate("mainWindow", u"Use the PDF image extraction method from KCC 8 and earlier.\n"
+"\n"
+"Useful for really weird PDFs.", None))
+#endif // QT_CONFIG(tooltip)
+ self.pdfExtractBox.setText(QCoreApplication.translate("mainWindow", u"PDF Legacy Extract", None))
self.gammaLabel.setText(QCoreApplication.translate("mainWindow", u"Gamma: Auto", None))
self.jpegQualityLabel.setText(QCoreApplication.translate("mainWindow", u"JPEG Quality:", None))
# retranslateUi
diff --git a/kindlecomicconverter/comic2ebook.py b/kindlecomicconverter/comic2ebook.py
index 855e05a..11da62e 100755
--- a/kindlecomicconverter/comic2ebook.py
+++ b/kindlecomicconverter/comic2ebook.py
@@ -48,6 +48,7 @@ from .comicarchive import SEVENZIP, available_archive_tools
from . import comic2panel
from . import image
from . import comicarchive
+from . import pdfjpgextract
from . import dualmetafix
from . import metadata
from . import kindle
@@ -875,6 +876,12 @@ def getWorkFolder(afile, workdir=None):
os.makedirs(fullPath)
path = workdir
sanitizePermissions(path)
+ if options.pdfextract:
+ pdf = pdfjpgextract.PdfJpgExtract(afile, fullPath)
+ njpg = pdf.extract()
+ if njpg == 0:
+ raise UserWarning("Failed to extract images from PDF file.")
+ return workdir
target_height = options.profileData[1][1]
if options.cropping == 1:
target_height = target_height + target_height*0.20 #Account for possible margin at the top and bottom
@@ -1342,6 +1349,8 @@ def makeParser():
processing_options.add_argument("-n", "--noprocessing", action="store_true", dest="noprocessing", default=False,
help="Do not modify image and ignore any profile or processing option")
+ processing_options.add_argument("--pdfextract", action="store_true", dest="pdfextract", default=False,
+ help="Use the legacy PDF image extraction method from KCC 8 and earlier")
processing_options.add_argument("-u", "--upscale", action="store_true", dest="upscale", default=False,
help="Resize images smaller than device's resolution")
processing_options.add_argument("-s", "--stretch", action="store_true", dest="stretch", default=False,
diff --git a/kindlecomicconverter/pdfjpgextract.py b/kindlecomicconverter/pdfjpgextract.py
new file mode 100644
index 0000000..751a68e
--- /dev/null
+++ b/kindlecomicconverter/pdfjpgextract.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2012-2014 Ciro Mattia Gonano
+# Copyright (c) 2013-2019 Pawel Jastrzebski
+#
+# Based upon the code snippet by Ned Batchelder
+# (http://nedbatchelder.com/blog/200712/extracting_jpgs_from_pdfs.html)
+#
+# Permission to use, copy, modify, and/or distribute this software for
+# any purpose with or without fee is hereby granted, provided that the
+# above copyright notice and this permission notice appear in all
+# copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
+# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
+# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
+# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
+# OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+# PERFORMANCE OF THIS SOFTWARE.
+#
+
+import os
+
+# Skip stray images only a few pixels in size that show up in some PDFs;
+# typical page images are many thousands of bytes long.
+# https://github.com/ciromattia/kcc/pull/546
+STRAY_IMAGE_LENGTH_THRESHOLD = 300
+
+
+class PdfJpgExtract:
+    """Carve embedded JPEG images out of a PDF by scanning its raw bytes.
+
+    The PDF structure is not parsed: the file is searched for ``stream``
+    keywords followed within 20 bytes by a JPEG start-of-image marker, and
+    each match is written to the output directory as ``jpgN.jpg``.
+    """
+
+    def __init__(self, fname, fullPath):
+        """Store the source PDF path and the output directory.
+
+        :param fname: path of the PDF file to scan.
+        :param fullPath: directory that receives the extracted JPEG files;
+            it is not created here.
+        """
+        self.fname = fname
+        self.path = fullPath
+
+    def getPath(self):
+        """Return the directory the extracted images are written to."""
+        return self.path
+
+    def extract(self):
+        """Write every embedded JPEG found in the PDF to the output directory.
+
+        Files are named ``jpg0.jpg``, ``jpg1.jpg``, ... in the order they are
+        found. Candidates shorter than ``STRAY_IMAGE_LENGTH_THRESHOLD`` bytes
+        are skipped and do not consume a file number.
+
+        :return: number of JPEG files written.
+        :raises Exception: if a JPEG stream has no ``endstream`` keyword or
+            no end-of-image marker can be found for it.
+        """
+        # Read inside a context manager so the handle is closed promptly.
+        with open(self.fname, "rb") as pdffile:
+            pdf = pdffile.read()
+        startmark = b"\xff\xd8"
+        endmark = b"\xff\xd9"
+        i = 0
+        njpg = 0
+        while True:
+            istream = pdf.find(b"stream", i)
+            if istream < 0:
+                break
+            # Only streams whose first bytes hold the JPEG start marker count.
+            istart = pdf.find(startmark, istream, istream + 20)
+            if istart < 0:
+                i = istream + 20
+                continue
+            iend = pdf.find(b"endstream", istart)
+            if iend < 0:
+                raise Exception("Didn't find end of stream!")
+            # Search for the end marker from just before "endstream", but never
+            # from before the image start: a very short stream could otherwise
+            # match earlier bytes and move the scan position backwards.
+            iend = pdf.find(endmark, max(istart, iend - 20))
+            if iend < 0:
+                raise Exception("Didn't find end of JPG!")
+            iend += len(endmark)
+            i = iend
+
+            if iend - istart < STRAY_IMAGE_LENGTH_THRESHOLD:
+                continue
+
+            jpgname = os.path.join(self.path, "jpg%d.jpg" % njpg)
+            with open(jpgname, "wb") as jpgfile:
+                jpgfile.write(pdf[istart:iend])
+            njpg += 1
+
+        return njpg