1
0
mirror of https://github.com/ciromattia/kcc synced 2026-01-25 22:47:28 +00:00

Add legacy pdf image extract option (#1225)

This commit is contained in:
Alex Xu
2026-01-25 13:41:43 -08:00
committed by GitHub
parent f63387cae4
commit 9a4143ce62
6 changed files with 111 additions and 0 deletions

View File

@@ -238,6 +238,7 @@ MAIN:
PROCESSING:
-n, --noprocessing Do not modify image and ignore any profile or processing option
--pdfextract Use legacy PDF image extraction method from KCC 8 and earlier.
-u, --upscale Resize images smaller than device's resolution
-s, --stretch Stretch images to device's resolution
-r SPLITTER, --splitter SPLITTER

View File

@@ -896,6 +896,18 @@ Higher values are larger and higher quality, and may resolve blank page issues.<
</property>
</widget>
</item>
<item row="9" column="0">
<widget class="QCheckBox" name="pdfExtractBox">
<property name="toolTip">
<string>Use the PDF image extraction method from KCC 8 and earlier.
Useful for really weird PDFs.</string>
</property>
<property name="text">
<string>PDF Legacy Extract</string>
</property>
</widget>
</item>
</layout>
</widget>
</item>

View File

@@ -327,6 +327,8 @@ class WorkerThread(QThread):
options.maximizestrips = True
if GUI.disableProcessingBox.isChecked():
options.noprocessing = True
if GUI.pdfExtractBox.isChecked():
options.pdfextract = True
if GUI.metadataTitleBox.checkState() == Qt.CheckState.PartiallyChecked:
options.metadatatitle = 1
elif GUI.metadataTitleBox.checkState() == Qt.CheckState.Checked:
@@ -1032,6 +1034,7 @@ class KCCGUI(KCC_ui.Ui_mainWindow):
'colorBox': GUI.colorBox.checkState(),
'eraseRainbowBox': GUI.eraseRainbowBox.checkState(),
'disableProcessingBox': GUI.disableProcessingBox.checkState(),
'pdfExtractBox': GUI.pdfExtractBox.checkState(),
'metadataTitleBox': GUI.metadataTitleBox.checkState(),
'mozJpegBox': GUI.mozJpegBox.checkState(),
'jpegQualityBox': GUI.jpegQualityBox.checkState(),

View File

@@ -462,6 +462,11 @@ class Ui_mainWindow(object):
self.gridLayout_2.addWidget(self.jpegQualityBox, 8, 0, 1, 1)
self.pdfExtractBox = QCheckBox(self.optionWidget)
self.pdfExtractBox.setObjectName(u"pdfExtractBox")
self.gridLayout_2.addWidget(self.pdfExtractBox, 9, 0, 1, 1)
self.gridLayout.addWidget(self.optionWidget, 5, 0, 1, 2)
@@ -733,6 +738,12 @@ class Ui_mainWindow(object):
"Higher values are larger and higher quality, and may resolve blank page issues.", None))
#endif // QT_CONFIG(tooltip)
self.jpegQualityBox.setText(QCoreApplication.translate("mainWindow", u"Custom JPEG Quality", None))
#if QT_CONFIG(tooltip)
self.pdfExtractBox.setToolTip(QCoreApplication.translate("mainWindow", u"Use the PDF image extraction method from KCC 8 and earlier.\n"
"\n"
"Useful for really weird PDFs.", None))
#endif // QT_CONFIG(tooltip)
self.pdfExtractBox.setText(QCoreApplication.translate("mainWindow", u"PDF Legacy Extract", None))
self.gammaLabel.setText(QCoreApplication.translate("mainWindow", u"Gamma: Auto", None))
self.jpegQualityLabel.setText(QCoreApplication.translate("mainWindow", u"JPEG Quality:", None))
# retranslateUi

View File

@@ -48,6 +48,7 @@ from .comicarchive import SEVENZIP, available_archive_tools
from . import comic2panel
from . import image
from . import comicarchive
from . import pdfjpgextract
from . import dualmetafix
from . import metadata
from . import kindle
@@ -875,6 +876,12 @@ def getWorkFolder(afile, workdir=None):
os.makedirs(fullPath)
path = workdir
sanitizePermissions(path)
if options.pdfextract:
pdf = pdfjpgextract.PdfJpgExtract(afile, fullPath)
njpg = pdf.extract()
if njpg == 0:
raise UserWarning("Failed to extract images from PDF file.")
return workdir
target_height = options.profileData[1][1]
if options.cropping == 1:
target_height = target_height + target_height*0.20 #Account for possible margin at the top and bottom
@@ -1342,6 +1349,8 @@ def makeParser():
processing_options.add_argument("-n", "--noprocessing", action="store_true", dest="noprocessing", default=False,
help="Do not modify image and ignore any profile or processing option")
processing_options.add_argument("--pdfextract", action="store_true", dest="pdfextract", default=False,
help="Use the legacy PDF image extraction method from KCC 8 and earlier")
processing_options.add_argument("-u", "--upscale", action="store_true", dest="upscale", default=False,
help="Resize images smaller than device's resolution")
processing_options.add_argument("-s", "--stretch", action="store_true", dest="stretch", default=False,

View File

@@ -0,0 +1,75 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2012-2014 Ciro Mattia Gonano <ciromattia@gmail.com>
# Copyright (c) 2013-2019 Pawel Jastrzebski <pawelj@iosphe.re>
#
# Based upon the code snippet by Ned Batchelder
# (http://nedbatchelder.com/blog/200712/extracting_jpgs_from_pdfs.html)
#
# Permission to use, copy, modify, and/or distribute this software for
# any purpose with or without fee is hereby granted, provided that the
# above copyright notice and this permission notice appear in all
# copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
# OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.
#
import os
# skip stray images a few pixels in size in some PDFs
# typical images are many thousands in length
# https://github.com/ciromattia/kcc/pull/546
STRAY_IMAGE_LENGTH_THRESHOLD = 300
class PdfJpgExtract:
def __init__(self, fname, fullPath):
self.fname = fname
self.path = fullPath
def getPath(self):
return self.path
def extract(self):
pdf = open(self.fname, "rb").read()
startmark = b"\xff\xd8"
startfix = 0
endmark = b"\xff\xd9"
endfix = 2
i = 0
njpg = 0
while True:
istream = pdf.find(b"stream", i)
if istream < 0:
break
istart = pdf.find(startmark, istream, istream + 20)
if istart < 0:
i = istream + 20
continue
iend = pdf.find(b"endstream", istart)
if iend < 0:
raise Exception("Didn't find end of stream!")
iend = pdf.find(endmark, iend - 20)
if iend < 0:
raise Exception("Didn't find end of JPG!")
istart += startfix
iend += endfix
i = iend
if iend - istart < STRAY_IMAGE_LENGTH_THRESHOLD:
continue
jpg = pdf[istart:iend]
jpgfile = open(os.path.join(self.path, "jpg%d.jpg" % njpg), "wb")
jpgfile.write(jpg)
jpgfile.close()
njpg += 1
return njpg