1
0
mirror of https://github.com/ciromattia/kcc synced 2025-12-12 17:26:23 +00:00

Improve pdf support by using mupdf (#983)

* Improve pdf support with mupdf

* parallel page ranges not pages

* fix black blank

* remove full=True

* add TODO

* fix doc close

---------

Co-authored-by: Alex Xu <alexkurosakimh3@gmail.com>
This commit is contained in:
Adrian
2025-07-19 03:45:20 +03:00
committed by GitHub
parent cc2eb9dcf3
commit eb24a400b4
3 changed files with 150 additions and 86 deletions

View File

@@ -32,7 +32,7 @@ from typing import List
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
from tempfile import mkdtemp, gettempdir, TemporaryFile
from shutil import move, copytree, rmtree, copyfile
from multiprocessing import Pool
from multiprocessing import Pool, cpu_count
from uuid import uuid4
from natsort import os_sort_keygen, os_sorted
from slugify import slugify as slugify_ext
@@ -41,13 +41,14 @@ from pathlib import Path
from subprocess import STDOUT, PIPE, CalledProcessError
from psutil import virtual_memory, disk_usage
from html import escape as hescape
import pymupdf
import numpy as np
from .shared import getImageFileName, walkSort, walkLevel, sanitizeTrace, subprocess_run
from .comicarchive import SEVENZIP, available_archive_tools
from . import comic2panel
from . import image
from . import comicarchive
from . import pdfjpgextract
from . import dualmetafix
from . import metadata
from . import kindle
@@ -666,6 +667,141 @@ def imgFileProcessing(work):
return str(sys.exc_info()[1]), sanitizeTrace(sys.exc_info()[2])
def render_page(vector):
"""Render a page range of a document.
Notes:
The PyMuPDF document cannot be part of the argument, because that
cannot be pickled. So we are being passed in just its filename.
This is no performance issue, because we are a separate process and
need to open the document anyway.
Any page-specific function can be processed here - rendering is just
an example - text extraction might be another.
The work must however be self-contained: no inter-process communication
or synchronization is possible with this design.
Care must also be taken with which parameters are contained in the
argument, because it will be passed in via pickling by the Pool class.
So any large objects will increase the overall duration.
Args:
vector: a list containing required parameters.
"""
# recreate the arguments
idx = vector[0] # this is the segment number we have to process
cpu = vector[1] # number of CPUs
filename = vector[2] # document filename
output_dir = vector[3]
target_height = vector[4]
try:
with pymupdf.open(filename) as doc: # open the document
num_pages = doc.page_count # get number of pages
# pages per segment: make sure that cpu * seg_size >= num_pages!
seg_size = int(num_pages / cpu + 1)
seg_from = idx * seg_size # our first page number
seg_to = min(seg_from + seg_size, num_pages) # last page number
for i in range(seg_from, seg_to): # work through our page segment
page = doc[i]
mat = target_height / page.rect.height
# TODO: decide colorspace earlier so later color check is cheaper.
pix = page.get_pixmap(matrix=mat, colorspace='RGB', alpha=False)
pix.save(os.path.join(output_dir, "p-%i.png" % i))
print("Processed page numbers %i through %i" % (seg_from, seg_to - 1))
except Exception as e:
raise UserWarning(f"Error rendering {filename}: {e}")
def extract_page(vector):
"""For pages with single image (and no text). Otherwise it's recommended to use render_page()
Notes:
The PyMuPDF document cannot be part of the argument, because that
cannot be pickled. So we are being passed in just its filename.
This is no performance issue, because we are a separate process and
need to open the document anyway.
Any page-specific function can be processed here - rendering is just
an example - text extraction might be another.
The work must however be self-contained: no inter-process communication
or synchronization is possible with this design.
Care must also be taken with which parameters are contained in the
argument, because it will be passed in via pickling by the Pool class.
So any large objects will increase the overall duration.
Args:
vector: a list containing required parameters.
"""
# recreate the arguments
idx = vector[0] # this is the segment number we have to process
cpu = vector[1] # number of CPUs
filename = vector[2] # document filename
output_dir = vector[3]
try:
with pymupdf.open(filename) as doc: # open the document
num_pages = doc.page_count # get number of pages
# pages per segment: make sure that cpu * seg_size >= num_pages!
seg_size = int(num_pages / cpu + 1)
seg_from = idx * seg_size # our first page number
seg_to = min(seg_from + seg_size, num_pages) # last page number
for i in range(seg_from, seg_to): # work through our page segment
output_path = os.path.join(output_dir, "p-%i.png" % i)
page = doc.load_page(i)
image_list = page.get_images()
if len(image_list) > 1:
raise UserWarning("mupdf_pdf_extract_page_image() function can be used only with single image pages.")
if not image_list:
width, height = int(page.rect.width), int(page.rect.height)
blank_page = Image.new("RGB", (width, height), "white")
blank_page.save(output_path)
xref = image_list[0][0]
pix = pymupdf.Pixmap(doc, xref)
if pix.colorspace is None:
# It's a stencil mask (grayscale image with inverted colors)
mask_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width)
inverted = 255 - mask_array
img = Image.fromarray(inverted, mode="L")
img.save(output_path)
if pix.colorspace.name.startswith("Colorspace(CS_GRAY)"):
# Make sure that an image is just grayscale and not smth like "Colorspace(CS_GRAY) - Separation(DeviceCMYK,Black)"
pix = pymupdf.Pixmap(pymupdf.csGRAY, pix)
else:
pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
if pix.alpha:
pix = pymupdf.Pixmap(pix, alpha=0)
pix.save(output_path)
print("Processed page numbers %i through %i" % (seg_from, seg_to - 1))
except Exception as e:
raise UserWarning(f"Error exporting {filename}: {e}")
def mupdf_pdf_process_pages_parallel(filename, output_dir, target_height):
render = False
with pymupdf.open(filename) as doc:
for page in doc:
page_text = page.get_text().strip()
if page_text != "":
render = True
break
if len(page.get_images()) > 1:
render = True
break
cpu = cpu_count()
# make vectors of arguments for the processes
vectors = [(i, cpu, filename, output_dir, target_height) for i in range(cpu)]
print("Starting %i processes for '%s'." % (cpu, filename))
try:
with Pool(processes=cpu_count()-1) as pool:
results = pool.map(
render_page if render else extract_page, vectors
)
except Exception as e:
raise UserWarning(f"Error while processing PDF pages: {e}")
def getWorkFolder(afile):
if os.path.isdir(afile):
if disk_usage(gettempdir())[2] < getDirectorySize(afile) * 2.5:
@@ -684,13 +820,19 @@ def getWorkFolder(afile):
if disk_usage(gettempdir())[2] < os.path.getsize(afile) * 2.5:
raise UserWarning("Not enough disk space to perform conversion.")
if afile.lower().endswith('.pdf'):
pdf = pdfjpgextract.PdfJpgExtract(afile)
path, njpg = pdf.extract()
workdir = path
workdir = mkdtemp('', 'KCC-', os.path.dirname(afile))
path = workdir
sanitizePermissions(path)
if njpg == 0:
target_height = options.profileData[1][1]
if options.cropping == 1:
target_height = target_height + target_height*0.20 #Account for possible margin at the top and bottom
elif options.cropping == 2:
target_height = target_height + target_height*0.25 #Account for possible margin at the top and bottom with page number
try:
mupdf_pdf_process_pages_parallel(afile, workdir, target_height)
except Exception as e:
rmtree(path, True)
raise UserWarning("Failed to extract images from PDF file.")
raise UserWarning(f"Failed to extract images from PDF file. {e}")
else:
workdir = mkdtemp('', 'KCC-', os.path.dirname(afile))
try:

View File

@@ -1,79 +0,0 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2012-2014 Ciro Mattia Gonano <ciromattia@gmail.com>
# Copyright (c) 2013-2019 Pawel Jastrzebski <pawelj@iosphe.re>
#
# Based upon the code snippet by Ned Batchelder
# (http://nedbatchelder.com/blog/200712/extracting_jpgs_from_pdfs.html)
#
# Permission to use, copy, modify, and/or distribute this software for
# any purpose with or without fee is hereby granted, provided that the
# above copyright notice and this permission notice appear in all
# copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
# OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.
#
import os
from random import choice
from string import ascii_uppercase, digits
# skip stray images a few pixels in size in some PDFs
# typical images are many thousands in length
# https://github.com/ciromattia/kcc/pull/546
STRAY_IMAGE_LENGTH_THRESHOLD = 300
class PdfJpgExtract:
def __init__(self, fname):
self.fname = fname
self.filename = os.path.splitext(fname)
self.path = self.filename[0] + "-KCC-" + ''.join(choice(ascii_uppercase + digits) for _ in range(3))
def getPath(self):
return self.path
def extract(self):
pdf = open(self.fname, "rb").read()
startmark = b"\xff\xd8"
startfix = 0
endmark = b"\xff\xd9"
endfix = 2
i = 0
njpg = 0
os.makedirs(self.path)
while True:
istream = pdf.find(b"stream", i)
if istream < 0:
break
istart = pdf.find(startmark, istream, istream + 20)
if istart < 0:
i = istream + 20
continue
iend = pdf.find(b"endstream", istart)
if iend < 0:
raise Exception("Didn't find end of stream!")
iend = pdf.find(endmark, iend - 20)
if iend < 0:
raise Exception("Didn't find end of JPG!")
istart += startfix
iend += endfix
i = iend
if iend - istart < STRAY_IMAGE_LENGTH_THRESHOLD:
continue
jpg = pdf[istart:iend]
jpgfile = open(self.path + "/jpg%d.jpg" % njpg, "wb")
jpgfile.write(jpg)
jpgfile.close()
njpg += 1
return self.path, njpg

View File

@@ -9,3 +9,4 @@ mozjpeg-lossless-optimization>=1.2.0
natsort>=8.4.0
distro>=1.8.0
numpy>=1.22.4
PyMuPDF>=1.26.1