diff --git a/README.md b/README.md index a58bff5..575ffe9 100644 --- a/README.md +++ b/README.md @@ -238,6 +238,7 @@ MAIN: PROCESSING: -n, --noprocessing Do not modify image and ignore any profile or processing option + --pdfextract Use legacy PDF image extraction method from KCC 8 and earlier. -u, --upscale Resize images smaller than device's resolution -s, --stretch Stretch images to device's resolution -r SPLITTER, --splitter SPLITTER diff --git a/gui/KCC.ui b/gui/KCC.ui index e2be86d..63dd857 100644 --- a/gui/KCC.ui +++ b/gui/KCC.ui @@ -896,6 +896,18 @@ Higher values are larger and higher quality, and may resolve blank page issues.< + + + + Use the PDF image extraction method from KCC 8 and earlier. + +Useful for really weird PDFs. + + + PDF Legacy Extract + + + diff --git a/kindlecomicconverter/KCC_gui.py b/kindlecomicconverter/KCC_gui.py index 464d8fb..46650fa 100644 --- a/kindlecomicconverter/KCC_gui.py +++ b/kindlecomicconverter/KCC_gui.py @@ -327,6 +327,8 @@ class WorkerThread(QThread): options.maximizestrips = True if GUI.disableProcessingBox.isChecked(): options.noprocessing = True + if GUI.pdfExtractBox.isChecked(): + options.pdfextract = True if GUI.metadataTitleBox.checkState() == Qt.CheckState.PartiallyChecked: options.metadatatitle = 1 elif GUI.metadataTitleBox.checkState() == Qt.CheckState.Checked: @@ -1032,6 +1034,7 @@ class KCCGUI(KCC_ui.Ui_mainWindow): 'colorBox': GUI.colorBox.checkState(), 'eraseRainbowBox': GUI.eraseRainbowBox.checkState(), 'disableProcessingBox': GUI.disableProcessingBox.checkState(), + 'pdfExtractBox': GUI.pdfExtractBox.checkState(), 'metadataTitleBox': GUI.metadataTitleBox.checkState(), 'mozJpegBox': GUI.mozJpegBox.checkState(), 'jpegQualityBox': GUI.jpegQualityBox.checkState(), diff --git a/kindlecomicconverter/KCC_ui.py b/kindlecomicconverter/KCC_ui.py index a9c7ecc..3574d15 100644 --- a/kindlecomicconverter/KCC_ui.py +++ b/kindlecomicconverter/KCC_ui.py @@ -462,6 +462,11 @@ class Ui_mainWindow(object): 
self.gridLayout_2.addWidget(self.jpegQualityBox, 8, 0, 1, 1) + self.pdfExtractBox = QCheckBox(self.optionWidget) + self.pdfExtractBox.setObjectName(u"pdfExtractBox") + + self.gridLayout_2.addWidget(self.pdfExtractBox, 9, 0, 1, 1) + self.gridLayout.addWidget(self.optionWidget, 5, 0, 1, 2) @@ -733,6 +738,12 @@ class Ui_mainWindow(object): "Higher values are larger and higher quality, and may resolve blank page issues.", None)) #endif // QT_CONFIG(tooltip) self.jpegQualityBox.setText(QCoreApplication.translate("mainWindow", u"Custom JPEG Quality", None)) +#if QT_CONFIG(tooltip) + self.pdfExtractBox.setToolTip(QCoreApplication.translate("mainWindow", u"Use the PDF image extraction method from KCC 8 and earlier.\n" +"\n" +"Useful for really weird PDFs.", None)) +#endif // QT_CONFIG(tooltip) + self.pdfExtractBox.setText(QCoreApplication.translate("mainWindow", u"PDF Legacy Extract", None)) self.gammaLabel.setText(QCoreApplication.translate("mainWindow", u"Gamma: Auto", None)) self.jpegQualityLabel.setText(QCoreApplication.translate("mainWindow", u"JPEG Quality:", None)) # retranslateUi diff --git a/kindlecomicconverter/comic2ebook.py b/kindlecomicconverter/comic2ebook.py index 855e05a..11da62e 100755 --- a/kindlecomicconverter/comic2ebook.py +++ b/kindlecomicconverter/comic2ebook.py @@ -48,6 +48,7 @@ from .comicarchive import SEVENZIP, available_archive_tools from . import comic2panel from . import image from . import comicarchive +from . import pdfjpgextract from . import dualmetafix from . import metadata from . 
import kindle @@ -875,6 +876,12 @@ def getWorkFolder(afile, workdir=None): os.makedirs(fullPath) path = workdir sanitizePermissions(path) + if options.pdfextract: + pdf = pdfjpgextract.PdfJpgExtract(afile, fullPath) + njpg = pdf.extract() + if njpg == 0: + raise UserWarning("Failed to extract images from PDF file.") + return workdir target_height = options.profileData[1][1] if options.cropping == 1: target_height = target_height + target_height*0.20 #Account for possible margin at the top and bottom @@ -1342,6 +1349,8 @@ def makeParser(): processing_options.add_argument("-n", "--noprocessing", action="store_true", dest="noprocessing", default=False, help="Do not modify image and ignore any profile or processing option") + processing_options.add_argument("--pdfextract", action="store_true", dest="pdfextract", default=False, + help="Use the legacy PDF image extraction method from KCC 8 and earlier") processing_options.add_argument("-u", "--upscale", action="store_true", dest="upscale", default=False, help="Resize images smaller than device's resolution") processing_options.add_argument("-s", "--stretch", action="store_true", dest="stretch", default=False, diff --git a/kindlecomicconverter/pdfjpgextract.py b/kindlecomicconverter/pdfjpgextract.py new file mode 100644 index 0000000..751a68e --- /dev/null +++ b/kindlecomicconverter/pdfjpgextract.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2012-2014 Ciro Mattia Gonano +# Copyright (c) 2013-2019 Pawel Jastrzebski +# +# Based upon the code snippet by Ned Batchelder +# (http://nedbatchelder.com/blog/200712/extracting_jpgs_from_pdfs.html) +# +# Permission to use, copy, modify, and/or distribute this software for +# any purpose with or without fee is hereby granted, provided that the +# above copyright notice and this permission notice appear in all +# copies. 
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
# OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.
#

import os

# skip stray images a few pixels in size in some PDFs
# typical images are many thousands in length
# https://github.com/ciromattia/kcc/pull/546
STRAY_IMAGE_LENGTH_THRESHOLD = 300


class PdfJpgExtract:
    """Carve embedded JPEG images out of a PDF by scanning its raw bytes.

    The PDF is not parsed. The file is searched for ``stream`` keywords
    that are followed (within 20 bytes) by the JPEG start-of-image marker,
    and the bytes up to the end-of-image marker located near the matching
    ``endstream`` keyword are written out as ``jpg<N>.jpg``.
    """

    def __init__(self, fname, fullPath):
        """Remember the source PDF and the output directory.

        :param fname: path of the PDF file to read.
        :param fullPath: existing directory the extracted JPEGs are written to.
        """
        self.fname = fname
        self.path = fullPath

    def getPath(self):
        """Return the directory extracted images are written to."""
        return self.path

    def extract(self):
        """Write every embedded JPEG found in the PDF to the output directory.

        Images shorter than ``STRAY_IMAGE_LENGTH_THRESHOLD`` bytes are
        skipped and not counted.

        :return: number of JPEG files written.
        :raises Exception: if a JPEG stream has no ``endstream`` keyword or
            no end-of-image marker after it starts.
        """
        # Read the whole document up front; the context manager guarantees
        # the handle is released even if reading fails.
        with open(self.fname, "rb") as pdffile:
            pdf = pdffile.read()

        startmark = b"\xff\xd8"  # JPEG start-of-image (SOI) marker
        endmark = b"\xff\xd9"  # JPEG end-of-image (EOI) marker
        # How far after "stream" the SOI may appear, and how far before
        # "endstream" the EOI search begins.
        slack = 20

        i = 0
        njpg = 0
        while True:
            istream = pdf.find(b"stream", i)
            if istream < 0:
                break
            istart = pdf.find(startmark, istream, istream + slack)
            if istart < 0:
                # Not a JPEG stream (or the tail of an "endstream" keyword).
                i = istream + slack
                continue
            iend = pdf.find(b"endstream", istart)
            if iend < 0:
                raise Exception("Didn't find end of stream!")
            # Never start the EOI search before the image itself: for very
            # short streams "iend - slack" can precede istart (or even be
            # negative, which bytes.find treats as an offset from the end),
            # which could match an unrelated marker and move the cursor
            # backwards, re-finding the same stream forever.
            iend = pdf.find(endmark, max(istart, iend - slack))
            if iend < 0:
                raise Exception("Didn't find end of JPG!")
            iend += len(endmark)  # include the EOI marker in the image
            i = iend

            if iend - istart < STRAY_IMAGE_LENGTH_THRESHOLD:
                continue

            target = os.path.join(self.path, "jpg%d.jpg" % njpg)
            with open(target, "wb") as jpgfile:
                jpgfile.write(pdf[istart:iend])
            njpg += 1

        return njpg