Improve pdf support by using mupdf (#983)

* Improve pdf support with mupdf
* parallel page ranges not pages
* fix black blank
* remove full=True
* add TODO
* fix doc close

---------

Co-authored-by: Alex Xu <alexkurosakimh3@gmail.com>
2026-01-27 15:37:28 +00:00 · 2025-07-19 03:45:20 +03:00
parent cc2eb9dcf3
commit eb24a400b4
3 changed files with 150 additions and 86 deletions
--- a/kindlecomicconverter/comic2ebook.py
+++ b/kindlecomicconverter/comic2ebook.py
@@ -32,7 +32,7 @@ from typing import List
 from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
 from tempfile import mkdtemp, gettempdir, TemporaryFile
 from shutil import move, copytree, rmtree, copyfile
-from multiprocessing import Pool
+from multiprocessing import Pool, cpu_count
 from uuid import uuid4
 from natsort import os_sort_keygen, os_sorted
 from slugify import slugify as slugify_ext
@@ -41,13 +41,14 @@ from pathlib import Path
 from subprocess import STDOUT, PIPE, CalledProcessError
 from psutil import virtual_memory, disk_usage
 from html import escape as hescape
+import pymupdf
+import numpy as np

 from .shared import getImageFileName, walkSort, walkLevel, sanitizeTrace, subprocess_run
 from .comicarchive import SEVENZIP, available_archive_tools
 from . import comic2panel
 from . import image
 from . import comicarchive
-from . import pdfjpgextract
 from . import dualmetafix
 from . import metadata
 from . import kindle
@@ -666,6 +667,141 @@ def imgFileProcessing(work):
        return str(sys.exc_info()[1]), sanitizeTrace(sys.exc_info()[2])


+def render_page(vector):
+    """Render one contiguous segment of a PDF's pages to PNG files.
+
+    Designed as a ``multiprocessing.Pool`` worker: the PyMuPDF document
+    cannot be pickled, so only its filename is passed in and every worker
+    opens the file itself. The work is self-contained - no inter-process
+    communication or synchronization takes place. Keep ``vector`` small,
+    because the Pool pickles it for every task.
+
+    Pages are split into ``cpu`` segments of equal size; this call handles
+    segment number ``idx`` and writes each page as ``p-<page index>.png``.
+
+    Args:
+        vector: sequence ``(idx, cpu, filename, output_dir, target_height)``:
+            segment number, number of segments, PDF path, directory for
+            the PNGs, and the rendered page height in pixels.
+
+    Raises:
+        UserWarning: if opening or rendering the document fails.
+    """
+    idx, cpu, filename, output_dir, target_height = vector[:5]
+    try:
+        with pymupdf.open(filename) as doc:  # open the document
+            num_pages = doc.page_count
+
+            # pages per segment: make sure that cpu * seg_size >= num_pages!
+            seg_size = int(num_pages / cpu + 1)
+            seg_from = idx * seg_size  # our first page number
+            seg_to = min(seg_from + seg_size, num_pages)  # one past our last page
+
+            for i in range(seg_from, seg_to):  # work through our page segment
+                page = doc[i]
+                # get_pixmap() needs a real Matrix; a bare float is not a
+                # valid matrix_like, so build a uniform zoom explicitly.
+                zoom = target_height / page.rect.height
+                mat = pymupdf.Matrix(zoom, zoom)
+                # TODO: decide colorspace earlier so later color check is cheaper.
+                pix = page.get_pixmap(matrix=mat, colorspace='RGB', alpha=False)
+                pix.save(os.path.join(output_dir, "p-%i.png" % i))
+            print("Processed page numbers %i through %i" % (seg_from, seg_to - 1))
+    except Exception as e:
+        raise UserWarning(f"Error rendering {filename}: {e}") from e
+
+
+def extract_page(vector):
+    """Export the single embedded image of every page in one PDF segment.
+
+    Meant for pages holding one image and no text; otherwise it is
+    recommended to use render_page(). A page without any image is written
+    as a white page of the page's size.
+
+    Designed as a ``multiprocessing.Pool`` worker: the PyMuPDF document
+    cannot be pickled, so only its filename is passed in and every worker
+    opens the file itself. Pages are split into ``cpu`` segments; this call
+    handles segment ``idx`` and writes each page as ``p-<page index>.png``.
+
+    Args:
+        vector: sequence starting with ``(idx, cpu, filename, output_dir)``:
+            segment number, number of segments, PDF path and directory for
+            the PNGs. Any further items are ignored.
+
+    Raises:
+        UserWarning: if processing fails or a page holds several images.
+    """
+    idx, cpu, filename, output_dir = vector[:4]
+    try:
+        with pymupdf.open(filename) as doc:  # open the document
+            num_pages = doc.page_count
+
+            # pages per segment: make sure that cpu * seg_size >= num_pages!
+            seg_size = int(num_pages / cpu + 1)
+            seg_from = idx * seg_size  # our first page number
+            seg_to = min(seg_from + seg_size, num_pages)  # one past our last page
+
+            for i in range(seg_from, seg_to):  # work through our page segment
+                output_path = os.path.join(output_dir, "p-%i.png" % i)
+                page = doc.load_page(i)
+                image_list = page.get_images()
+                if len(image_list) > 1:
+                    raise UserWarning("mupdf_pdf_extract_page_image() function can be used only with single image pages.")
+                if not image_list:
+                    width, height = int(page.rect.width), int(page.rect.height)
+                    blank_page = Image.new("RGB", (width, height), "white")
+                    blank_page.save(output_path)
+                    continue  # nothing to extract; image_list[0] would raise
+                xref = image_list[0][0]
+                pix = pymupdf.Pixmap(doc, xref)
+                if pix.colorspace is None:
+                    # It's a stencil mask (grayscale image with inverted colors)
+                    mask_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width)
+                    inverted = 255 - mask_array
+                    img = Image.fromarray(inverted, mode="L")
+                    img.save(output_path)
+                    continue  # already saved; colorspace is None from here on
+                # NOTE(review): .name may never start with "Colorspace(" - confirm.
+                if pix.colorspace.name.startswith("Colorspace(CS_GRAY)"):
+                    # Make sure that an image is just grayscale and not smth like "Colorspace(CS_GRAY) - Separation(DeviceCMYK,Black)"
+                    pix = pymupdf.Pixmap(pymupdf.csGRAY, pix)
+                else:
+                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
+                if pix.alpha:
+                    pix = pymupdf.Pixmap(pix, alpha=0)
+                pix.save(output_path)
+            print("Processed page numbers %i through %i" % (seg_from, seg_to - 1))
+    except Exception as e:
+        raise UserWarning(f"Error exporting {filename}: {e}") from e
+
+
+def mupdf_pdf_process_pages_parallel(filename, output_dir, target_height):
+    """Convert every page of a PDF to a PNG file using a pool of processes.
+
+    render_page() is used if any page has text or several images, else extract_page().
+
+    Args:
+        filename: path of the PDF file.
+        output_dir: directory handed to the workers for their PNG output.
+        target_height: page height handed to render_page(); unused when extracting.
+
+    Raises:
+        UserWarning: if the pool or one of its workers fails.
+    """
+    with pymupdf.open(filename) as doc:
+        render = any(p.get_text().strip() != "" or len(p.get_images()) > 1 for p in doc)
+
+    cpu = cpu_count()
+    vectors = [(i, cpu, filename, output_dir, target_height) for i in range(cpu)]
+    print("Starting %i processes for '%s'." % (cpu, filename))
+    try:
+        # cpu - 1 workers as before, but Pool() rejects processes=0 on 1-CPU hosts
+        with Pool(processes=max(1, cpu - 1)) as pool:
+            pool.map(render_page if render else extract_page, vectors)
+    except Exception as e:
+        raise UserWarning(f"Error while processing PDF pages: {e}") from e
+
+
 def getWorkFolder(afile):
    if os.path.isdir(afile):
        if disk_usage(gettempdir())[2] < getDirectorySize(afile) * 2.5:
@@ -684,13 +820,19 @@ def getWorkFolder(afile):
        if disk_usage(gettempdir())[2] < os.path.getsize(afile) * 2.5:
            raise UserWarning("Not enough disk space to perform conversion.")
        if afile.lower().endswith('.pdf'):
-            pdf = pdfjpgextract.PdfJpgExtract(afile)
-            path, njpg = pdf.extract()
-            workdir = path
+            workdir = mkdtemp('', 'KCC-', os.path.dirname(afile))
+            path = workdir
            sanitizePermissions(path)
-            if njpg == 0:
+            target_height = options.profileData[1][1]
+            if options.cropping == 1:
+                target_height = target_height + target_height*0.20 #Account for possible margin at the top and bottom
+            elif options.cropping == 2:
+                target_height = target_height + target_height*0.25 #Account for possible margin at the top and bottom with page number
+            try:
+                mupdf_pdf_process_pages_parallel(afile, workdir, target_height)
+            except Exception as e:
                rmtree(path, True)
-                raise UserWarning("Failed to extract images from PDF file.")
+                raise UserWarning(f"Failed to extract images from PDF file. {e}")
        else:
            workdir = mkdtemp('', 'KCC-', os.path.dirname(afile))
            try:
--- a/kindlecomicconverter/pdfjpgextract.py
+++ b/kindlecomicconverter/pdfjpgextract.py
@@ -1,79 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# Copyright (c) 2012-2014 Ciro Mattia Gonano <ciromattia@gmail.com>
-# Copyright (c) 2013-2019 Pawel Jastrzebski <pawelj@iosphe.re>
-#
-# Based upon the code snippet by Ned Batchelder
-# (http://nedbatchelder.com/blog/200712/extracting_jpgs_from_pdfs.html)
-#
-# Permission to use, copy, modify, and/or distribute this software for
-# any purpose with or without fee is hereby granted, provided that the
-# above copyright notice and this permission notice appear in all
-# copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
-# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
-# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
-# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
-# OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
-# PERFORMANCE OF THIS SOFTWARE.
-#
-
-import os
-from random import choice
-from string import ascii_uppercase, digits
-
-# skip stray images a few pixels in size in some PDFs
-# typical images are many thousands in length
-# https://github.com/ciromattia/kcc/pull/546
-STRAY_IMAGE_LENGTH_THRESHOLD = 300
-
-
-class PdfJpgExtract:
-    def __init__(self, fname):
-        self.fname = fname
-        self.filename = os.path.splitext(fname)
-        self.path = self.filename[0] + "-KCC-" + ''.join(choice(ascii_uppercase + digits) for _ in range(3))
-
-    def getPath(self):
-        return self.path
-
-    def extract(self):
-        pdf = open(self.fname, "rb").read()
-        startmark = b"\xff\xd8"
-        startfix = 0
-        endmark = b"\xff\xd9"
-        endfix = 2
-        i = 0
-        njpg = 0
-        os.makedirs(self.path)
-        while True:
-            istream = pdf.find(b"stream", i)
-            if istream < 0:
-                break
-            istart = pdf.find(startmark, istream, istream + 20)
-            if istart < 0:
-                i = istream + 20
-                continue
-            iend = pdf.find(b"endstream", istart)
-            if iend < 0:
-                raise Exception("Didn't find end of stream!")
-            iend = pdf.find(endmark, iend - 20)
-            if iend < 0:
-                raise Exception("Didn't find end of JPG!")
-            istart += startfix
-            iend += endfix
-            i = iend
-
-            if iend - istart < STRAY_IMAGE_LENGTH_THRESHOLD:
-                continue
-
-            jpg = pdf[istart:iend]
-            jpgfile = open(self.path + "/jpg%d.jpg" % njpg, "wb")
-            jpgfile.write(jpg)
-            jpgfile.close()
-            njpg += 1
-
-        return self.path, njpg
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,3 +9,4 @@ mozjpeg-lossless-optimization>=1.2.0
 natsort>=8.4.0
 distro>=1.8.0
 numpy>=1.22.4
+PyMuPDF>=1.26.1