1
0
mirror of https://github.com/ciromattia/kcc synced 2025-12-13 01:36:27 +00:00

Improve pdf support by using mupdf (#983)

* Improve pdf support with mupdf

* parallel page ranges not pages

* fix black blank

* remove full=True

* add TODO

* fix doc close

---------

Co-authored-by: Alex Xu <alexkurosakimh3@gmail.com>
This commit is contained in:
Adrian
2025-07-19 03:45:20 +03:00
committed by GitHub
parent cc2eb9dcf3
commit eb24a400b4
3 changed files with 150 additions and 86 deletions

View File

@@ -32,7 +32,7 @@ from typing import List
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
from tempfile import mkdtemp, gettempdir, TemporaryFile
from shutil import move, copytree, rmtree, copyfile
from multiprocessing import Pool
from multiprocessing import Pool, cpu_count
from uuid import uuid4
from natsort import os_sort_keygen, os_sorted
from slugify import slugify as slugify_ext
@@ -41,13 +41,14 @@ from pathlib import Path
from subprocess import STDOUT, PIPE, CalledProcessError
from psutil import virtual_memory, disk_usage
from html import escape as hescape
import pymupdf
import numpy as np
from .shared import getImageFileName, walkSort, walkLevel, sanitizeTrace, subprocess_run
from .comicarchive import SEVENZIP, available_archive_tools
from . import comic2panel
from . import image
from . import comicarchive
from . import pdfjpgextract
from . import dualmetafix
from . import metadata
from . import kindle
@@ -666,6 +667,141 @@ def imgFileProcessing(work):
return str(sys.exc_info()[1]), sanitizeTrace(sys.exc_info()[2])
def render_page(vector):
    """Render one contiguous segment of a PDF's pages to PNG files.

    Designed to run as a ``multiprocessing.Pool`` worker.

    Notes:
        The PyMuPDF document cannot be part of the argument, because it
        cannot be pickled, so only its filename is passed in. This costs
        nothing: each worker is a separate process and has to open the
        document anyway.
        The work must be self-contained: no inter-process communication or
        synchronization is possible with this design.
        Keep the argument small - it is pickled by the Pool class, so large
        objects increase the overall duration.

    Args:
        vector: a sequence ``(idx, cpu, filename, output_dir, target_height)``
            where ``idx`` is the zero-based segment number to process,
            ``cpu`` is the number of segments the document is divided into,
            ``filename`` is the document path, ``output_dir`` is the folder
            that receives ``p-<page index>.png`` files and ``target_height``
            is the desired height of the rendered page (the zoom factor is
            ``target_height / page height``).

    Raises:
        UserWarning: if the document cannot be opened or a page cannot be
            rendered or saved; the original exception is chained as the
            cause.
    """
    idx, cpu, filename, output_dir, target_height = vector[:5]
    try:
        with pymupdf.open(filename) as doc:
            num_pages = doc.page_count
            # Pages per segment: "+ 1" guarantees cpu * seg_size >= num_pages.
            seg_size = num_pages // cpu + 1
            seg_from = idx * seg_size  # first page index of this segment
            seg_to = min(seg_from + seg_size, num_pages)  # one past the last

            for i in range(seg_from, seg_to):
                page = doc[i]
                # get_pixmap() needs a matrix-like object; a bare float is
                # not a valid matrix, so build a uniform scaling matrix.
                zoom = target_height / page.rect.height
                mat = pymupdf.Matrix(zoom, zoom)
                # TODO: decide colorspace earlier so later color check is cheaper.
                pix = page.get_pixmap(matrix=mat, colorspace='RGB', alpha=False)
                pix.save(os.path.join(output_dir, "p-%i.png" % i))
            print("Processed page numbers %i through %i" % (seg_from, seg_to - 1))
    except Exception as e:
        raise UserWarning(f"Error rendering {filename}: {e}") from e
def extract_page(vector):
    """Export the single embedded image of each page in one PDF segment.

    For pages with a single image (and no text). Otherwise it is recommended
    to use render_page(). Designed to run as a ``multiprocessing.Pool``
    worker.

    Notes:
        The PyMuPDF document cannot be part of the argument, because it
        cannot be pickled, so only its filename is passed in. Each worker is
        a separate process and has to open the document anyway.
        The work must be self-contained: no inter-process communication or
        synchronization is possible with this design.
        Keep the argument small - it is pickled by the Pool class.

    Args:
        vector: a sequence whose first four items are
            ``(idx, cpu, filename, output_dir)``: the zero-based segment
            number, the number of segments the document is divided into,
            the document path and the folder that receives
            ``p-<page index>.png`` files. Extra items are ignored.

    Raises:
        UserWarning: if a page holds more than one image, or if opening,
            decoding or saving fails; the original exception is chained as
            the cause.
    """
    idx, cpu, filename, output_dir = vector[:4]
    try:
        with pymupdf.open(filename) as doc:
            num_pages = doc.page_count
            # Pages per segment: "+ 1" guarantees cpu * seg_size >= num_pages.
            seg_size = num_pages // cpu + 1
            seg_from = idx * seg_size  # first page index of this segment
            seg_to = min(seg_from + seg_size, num_pages)  # one past the last

            for i in range(seg_from, seg_to):
                output_path = os.path.join(output_dir, "p-%i.png" % i)
                page = doc.load_page(i)
                image_list = page.get_images()
                if len(image_list) > 1:
                    raise UserWarning("mupdf_pdf_extract_page_image() function can be used only with single image pages.")

                if not image_list:
                    # Page without any image: emit a white page of the same
                    # size and move on - there is nothing to extract.
                    # NOTE(review): Image is presumably PIL.Image imported at
                    # module level outside the visible part of the file.
                    width, height = int(page.rect.width), int(page.rect.height)
                    blank_page = Image.new("RGB", (width, height), "white")
                    blank_page.save(output_path)
                    continue

                xref = image_list[0][0]
                pix = pymupdf.Pixmap(doc, xref)

                if pix.colorspace is None:
                    # It's a stencil mask (grayscale image with inverted colors)
                    mask_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width)
                    inverted = 255 - mask_array
                    img = Image.fromarray(inverted, mode="L")
                    img.save(output_path)
                    # Already written; the colorspace handling below would
                    # dereference the missing colorspace.
                    continue

                if pix.colorspace.name.startswith("Colorspace(CS_GRAY)"):
                    # Make sure that an image is just grayscale and not smth like "Colorspace(CS_GRAY) - Separation(DeviceCMYK,Black)"
                    pix = pymupdf.Pixmap(pymupdf.csGRAY, pix)
                else:
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                if pix.alpha:
                    # Drop the alpha channel before saving.
                    pix = pymupdf.Pixmap(pix, alpha=0)
                pix.save(output_path)
            print("Processed page numbers %i through %i" % (seg_from, seg_to - 1))
    except Exception as e:
        raise UserWarning(f"Error exporting {filename}: {e}") from e
def mupdf_pdf_process_pages_parallel(filename, output_dir, target_height):
    """Convert every page of a PDF to a PNG file using a process pool.

    The document is scanned first: if any page contains text or more than
    one image, all pages are rasterized with render_page(); otherwise the
    embedded images are exported directly with extract_page().

    The page range is divided into ``cpu_count()`` segments, each handled by
    one worker call.

    Args:
        filename: path of the PDF document.
        output_dir: folder that receives the ``p-<page index>.png`` files.
        target_height: desired page height handed to render_page()
            (unused when pages are extracted instead of rendered).

    Raises:
        UserWarning: if any worker fails; the original exception is chained
            as the cause.
    """
    with pymupdf.open(filename) as doc:
        # any() stops at the first page that needs rendering.
        render = any(
            page.get_text().strip() != "" or len(page.get_images()) > 1
            for page in doc
        )

    cpu = cpu_count()
    # Leave one core free, but never ask for zero workers: Pool rejects
    # processes < 1, which would break single-core machines.
    workers = max(1, cpu - 1)

    # make vectors of arguments for the processes
    vectors = [(i, cpu, filename, output_dir, target_height) for i in range(cpu)]
    print("Starting %i processes for '%s'." % (workers, filename))

    worker = render_page if render else extract_page
    try:
        with Pool(processes=workers) as pool:
            pool.map(worker, vectors)
    except Exception as e:
        raise UserWarning(f"Error while processing PDF pages: {e}") from e
def getWorkFolder(afile):
if os.path.isdir(afile):
if disk_usage(gettempdir())[2] < getDirectorySize(afile) * 2.5:
@@ -684,13 +820,19 @@ def getWorkFolder(afile):
if disk_usage(gettempdir())[2] < os.path.getsize(afile) * 2.5:
raise UserWarning("Not enough disk space to perform conversion.")
if afile.lower().endswith('.pdf'):
pdf = pdfjpgextract.PdfJpgExtract(afile)
path, njpg = pdf.extract()
workdir = path
workdir = mkdtemp('', 'KCC-', os.path.dirname(afile))
path = workdir
sanitizePermissions(path)
if njpg == 0:
target_height = options.profileData[1][1]
if options.cropping == 1:
target_height = target_height + target_height*0.20 #Account for possible margin at the top and bottom
elif options.cropping == 2:
target_height = target_height + target_height*0.25 #Account for possible margin at the top and bottom with page number
try:
mupdf_pdf_process_pages_parallel(afile, workdir, target_height)
except Exception as e:
rmtree(path, True)
raise UserWarning("Failed to extract images from PDF file.")
raise UserWarning(f"Failed to extract images from PDF file. {e}")
else:
workdir = mkdtemp('', 'KCC-', os.path.dirname(afile))
try: