mirror of
https://github.com/ciromattia/kcc
synced 2025-12-12 17:26:23 +00:00
Improve pdf support by using mupdf (#983)
* Improve pdf support with mupdf * parallel page ranges not pages * fix black blank * remove full=True * add TODO * fix doc close --------- Co-authored-by: Alex Xu <alexkurosakimh3@gmail.com>
This commit is contained in:
@@ -32,7 +32,7 @@ from typing import List
|
||||
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
|
||||
from tempfile import mkdtemp, gettempdir, TemporaryFile
|
||||
from shutil import move, copytree, rmtree, copyfile
|
||||
from multiprocessing import Pool
|
||||
from multiprocessing import Pool, cpu_count
|
||||
from uuid import uuid4
|
||||
from natsort import os_sort_keygen, os_sorted
|
||||
from slugify import slugify as slugify_ext
|
||||
@@ -41,13 +41,14 @@ from pathlib import Path
|
||||
from subprocess import STDOUT, PIPE, CalledProcessError
|
||||
from psutil import virtual_memory, disk_usage
|
||||
from html import escape as hescape
|
||||
import pymupdf
|
||||
import numpy as np
|
||||
|
||||
from .shared import getImageFileName, walkSort, walkLevel, sanitizeTrace, subprocess_run
|
||||
from .comicarchive import SEVENZIP, available_archive_tools
|
||||
from . import comic2panel
|
||||
from . import image
|
||||
from . import comicarchive
|
||||
from . import pdfjpgextract
|
||||
from . import dualmetafix
|
||||
from . import metadata
|
||||
from . import kindle
|
||||
@@ -666,6 +667,141 @@ def imgFileProcessing(work):
|
||||
return str(sys.exc_info()[1]), sanitizeTrace(sys.exc_info()[2])
|
||||
|
||||
|
||||
def render_page(vector):
    """Render one contiguous segment of a PDF's pages to PNG files.

    Designed to be run as a ``multiprocessing.Pool`` worker: the document is
    divided into ``cpu`` page segments and this call handles segment ``idx``,
    writing every page of the segment as ``p-<page index>.png`` into
    ``output_dir``. Each page is scaled so that its rendered height is
    ``target_height``.

    Notes:
        The PyMuPDF document cannot be part of the argument, because it
        cannot be pickled, so only its filename is passed in and the
        document is reopened here.
        The work must be self-contained: no inter-process communication
        or synchronization is possible with this design.
        The argument is pickled by the Pool class, so large objects in it
        increase the overall duration.

    Args:
        vector: sequence ``(idx, cpu, filename, output_dir, target_height)``
            where ``idx`` is the zero-based segment number, ``cpu`` the
            total number of segments, ``filename`` the PDF path,
            ``output_dir`` the directory receiving the PNG files and
            ``target_height`` the wanted page height in pixels.

    Raises:
        UserWarning: if the document cannot be opened or a page cannot be
            rendered; the underlying exception is chained as the cause.
    """
    # recreate the arguments
    idx = vector[0]  # segment number we have to process
    cpu = vector[1]  # number of segments the document is split into
    filename = vector[2]  # document filename
    output_dir = vector[3]
    target_height = vector[4]
    try:
        with pymupdf.open(filename) as doc:
            num_pages = doc.page_count

            # pages per segment: make sure that cpu * seg_size >= num_pages!
            seg_size = int(num_pages / cpu + 1)
            seg_from = idx * seg_size  # our first page number
            seg_to = min(seg_from + seg_size, num_pages)  # one past our last page

            for i in range(seg_from, seg_to):
                page = doc[i]
                # get_pixmap() needs a transformation matrix, not a bare
                # scale factor: scale both axes uniformly so the page keeps
                # its aspect ratio and ends up target_height pixels tall.
                zoom = target_height / page.rect.height
                mat = pymupdf.Matrix(zoom, zoom)
                # TODO: decide colorspace earlier so later color check is cheaper.
                pix = page.get_pixmap(matrix=mat, colorspace='RGB', alpha=False)
                pix.save(os.path.join(output_dir, "p-%i.png" % i))
            print("Processed page numbers %i through %i" % (seg_from, seg_to - 1))
    except Exception as e:
        raise UserWarning(f"Error rendering {filename}: {e}") from e
|
||||
|
||||
|
||||
def extract_page(vector):
    """Export the embedded image of each page in one PDF page segment.

    For pages with a single image (and no text); otherwise it is recommended
    to use render_page(). Designed to be run as a ``multiprocessing.Pool``
    worker: the document is divided into ``cpu`` page segments and this call
    handles segment ``idx``, writing one ``p-<page index>.png`` per page into
    ``output_dir``.

    Per page:
        * no image: a white RGB page of the page's size is written;
        * stencil mask (pixmap without colorspace): the samples are inverted
          and written as an 8-bit grayscale image;
        * otherwise: the pixmap is converted to gray or RGB, any alpha
          channel is dropped, and the result is written.

    Notes:
        The PyMuPDF document cannot be part of the argument, because it
        cannot be pickled, so only its filename is passed in and the
        document is reopened here.
        The work must be self-contained: no inter-process communication
        or synchronization is possible with this design.

    Args:
        vector: sequence whose first four items are
            ``(idx, cpu, filename, output_dir)``: zero-based segment number,
            total number of segments, PDF path and output directory.
            Further items are ignored.

    Raises:
        UserWarning: if a page holds more than one image or any step of the
            export fails; the underlying exception is chained as the cause.
    """
    # recreate the arguments
    idx = vector[0]  # segment number we have to process
    cpu = vector[1]  # number of segments the document is split into
    filename = vector[2]  # document filename
    output_dir = vector[3]

    try:
        with pymupdf.open(filename) as doc:
            num_pages = doc.page_count

            # pages per segment: make sure that cpu * seg_size >= num_pages!
            seg_size = int(num_pages / cpu + 1)
            seg_from = idx * seg_size  # our first page number
            seg_to = min(seg_from + seg_size, num_pages)  # one past our last page

            for i in range(seg_from, seg_to):
                output_path = os.path.join(output_dir, "p-%i.png" % i)
                page = doc.load_page(i)
                image_list = page.get_images()
                if len(image_list) > 1:
                    raise UserWarning("mupdf_pdf_extract_page_image() function can be used only with single image pages.")
                if not image_list:
                    width, height = int(page.rect.width), int(page.rect.height)
                    blank_page = Image.new("RGB", (width, height), "white")
                    blank_page.save(output_path)
                    # Nothing to extract: without this, image_list[0] below
                    # would raise IndexError for every image-less page.
                    continue

                xref = image_list[0][0]
                pix = pymupdf.Pixmap(doc, xref)
                if pix.colorspace is None:
                    # It's a stencil mask (grayscale image with inverted colors)
                    mask_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width)
                    inverted = 255 - mask_array
                    img = Image.fromarray(inverted, mode="L")
                    img.save(output_path)
                    # Page is done; the colorspace checks below would fail
                    # on a pixmap that has no colorspace.
                    continue

                if pix.colorspace.name.startswith("Colorspace(CS_GRAY)"):
                    # Make sure that an image is just grayscale and not smth like "Colorspace(CS_GRAY) - Separation(DeviceCMYK,Black)"
                    pix = pymupdf.Pixmap(pymupdf.csGRAY, pix)
                else:
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                if pix.alpha:
                    pix = pymupdf.Pixmap(pix, alpha=0)
                pix.save(output_path)
            print("Processed page numbers %i through %i" % (seg_from, seg_to - 1))
    except Exception as e:
        raise UserWarning(f"Error exporting {filename}: {e}") from e
|
||||
|
||||
|
||||
def mupdf_pdf_process_pages_parallel(filename, output_dir, target_height):
    """Convert every page of a PDF to a PNG file using a process pool.

    The document is scanned once to pick the strategy: if any page contains
    text or more than one image, all pages are rendered with render_page();
    otherwise the single embedded image of each page is exported with
    extract_page(). The pages are then split into ``cpu_count()`` segments
    that are distributed over the pool.

    Args:
        filename: path of the PDF document.
        output_dir: directory that receives the ``p-<page index>.png`` files.
        target_height: wanted page height in pixels (used when rendering).

    Raises:
        UserWarning: if the pool cannot be started or a worker fails; the
            underlying exception is chained as the cause.
    """
    render = False
    with pymupdf.open(filename) as doc:
        for page in doc:
            # Text or several images on a page cannot be reproduced by
            # exporting one embedded image, so the page must be rendered.
            if page.get_text().strip() != "" or len(page.get_images()) > 1:
                render = True
                break

    cpu = cpu_count()

    # make vectors of arguments for the processes: one per page segment
    vectors = [(i, cpu, filename, output_dir, target_height) for i in range(cpu)]

    # Leave one core free, but never ask for zero workers: Pool() rejects
    # processes < 1, which is what cpu_count() - 1 gives on a single core.
    workers = max(1, cpu - 1)
    print("Starting %i processes for '%s'." % (workers, filename))

    try:
        with Pool(processes=workers) as pool:
            pool.map(render_page if render else extract_page, vectors)
    except Exception as e:
        raise UserWarning(f"Error while processing PDF pages: {e}") from e
|
||||
|
||||
|
||||
def getWorkFolder(afile):
|
||||
if os.path.isdir(afile):
|
||||
if disk_usage(gettempdir())[2] < getDirectorySize(afile) * 2.5:
|
||||
@@ -684,13 +820,19 @@ def getWorkFolder(afile):
|
||||
if disk_usage(gettempdir())[2] < os.path.getsize(afile) * 2.5:
|
||||
raise UserWarning("Not enough disk space to perform conversion.")
|
||||
if afile.lower().endswith('.pdf'):
|
||||
pdf = pdfjpgextract.PdfJpgExtract(afile)
|
||||
path, njpg = pdf.extract()
|
||||
workdir = path
|
||||
workdir = mkdtemp('', 'KCC-', os.path.dirname(afile))
|
||||
path = workdir
|
||||
sanitizePermissions(path)
|
||||
if njpg == 0:
|
||||
target_height = options.profileData[1][1]
|
||||
if options.cropping == 1:
|
||||
target_height = target_height + target_height*0.20 #Account for possible margin at the top and bottom
|
||||
elif options.cropping == 2:
|
||||
target_height = target_height + target_height*0.25 #Account for possible margin at the top and bottom with page number
|
||||
try:
|
||||
mupdf_pdf_process_pages_parallel(afile, workdir, target_height)
|
||||
except Exception as e:
|
||||
rmtree(path, True)
|
||||
raise UserWarning("Failed to extract images from PDF file.")
|
||||
raise UserWarning(f"Failed to extract images from PDF file. {e}")
|
||||
else:
|
||||
workdir = mkdtemp('', 'KCC-', os.path.dirname(afile))
|
||||
try:
|
||||
|
||||
@@ -1,79 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Copyright (c) 2012-2014 Ciro Mattia Gonano <ciromattia@gmail.com>
|
||||
# Copyright (c) 2013-2019 Pawel Jastrzebski <pawelj@iosphe.re>
|
||||
#
|
||||
# Based upon the code snippet by Ned Batchelder
|
||||
# (http://nedbatchelder.com/blog/200712/extracting_jpgs_from_pdfs.html)
|
||||
#
|
||||
# Permission to use, copy, modify, and/or distribute this software for
|
||||
# any purpose with or without fee is hereby granted, provided that the
|
||||
# above copyright notice and this permission notice appear in all
|
||||
# copies.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
|
||||
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
|
||||
# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
|
||||
# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
|
||||
# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
|
||||
# OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
|
||||
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
||||
# PERFORMANCE OF THIS SOFTWARE.
|
||||
#
|
||||
|
||||
import os
|
||||
from random import choice
|
||||
from string import ascii_uppercase, digits
|
||||
|
||||
# skip stray images a few pixels in size in some PDFs
|
||||
# typical images are many thousands in length
|
||||
# https://github.com/ciromattia/kcc/pull/546
|
||||
STRAY_IMAGE_LENGTH_THRESHOLD = 300
|
||||
|
||||
|
||||
class PdfJpgExtract:
    """Extract embedded JPEG streams from a PDF by scanning its raw bytes.

    The PDF is not parsed: the file is searched for ``stream`` keywords that
    are followed within 20 bytes by the JPEG start marker (``FF D8``), and
    the bytes up to the JPEG end marker (``FF D9``) located near the matching
    ``endstream`` are written out as a ``.jpg`` file.
    """

    def __init__(self, fname):
        """Remember the source file and choose an output directory name.

        Args:
            fname: path of the PDF file. The output directory is the path
                without its extension, followed by ``-KCC-`` and three
                random uppercase letters or digits.
        """
        self.fname = fname
        self.filename = os.path.splitext(fname)
        self.path = self.filename[0] + "-KCC-" + ''.join(choice(ascii_uppercase + digits) for _ in range(3))

    def getPath(self):
        """Return the output directory path chosen in the constructor."""
        return self.path

    def extract(self):
        """Write every embedded JPEG to the output directory.

        The output directory is created first. Files are named
        ``jpg<N>.jpg`` with ``N`` counting from 0 in order of appearance.
        Candidates shorter than STRAY_IMAGE_LENGTH_THRESHOLD bytes are
        skipped.

        Returns:
            Tuple ``(path, njpg)``: the output directory and the number of
            JPEG files written.

        Raises:
            Exception: if a JPEG start marker is found but no ``endstream``
                keyword or no JPEG end marker follows it.
            OSError: if the input cannot be read or the output directory or
                files cannot be created.
        """
        # Read through a context manager so the handle is closed right away
        # instead of being left to the garbage collector.
        with open(self.fname, "rb") as pdffile:
            pdf = pdffile.read()

        startmark = b"\xff\xd8"
        startfix = 0
        endmark = b"\xff\xd9"
        endfix = 2  # length of the end marker, so the slice includes it
        i = 0
        njpg = 0
        os.makedirs(self.path)
        while True:
            istream = pdf.find(b"stream", i)
            if istream < 0:
                break
            # The JPEG data must begin within 20 bytes of the keyword.
            istart = pdf.find(startmark, istream, istream + 20)
            if istart < 0:
                i = istream + 20
                continue
            iend = pdf.find(b"endstream", istart)
            if iend < 0:
                raise Exception("Didn't find end of stream!")
            # Look for the end marker starting 20 bytes before "endstream".
            iend = pdf.find(endmark, iend - 20)
            if iend < 0:
                raise Exception("Didn't find end of JPG!")
            istart += startfix
            iend += endfix
            i = iend

            if iend - istart < STRAY_IMAGE_LENGTH_THRESHOLD:
                continue

            # The context manager closes the file even if the write fails.
            with open(os.path.join(self.path, "jpg%d.jpg" % njpg), "wb") as jpgfile:
                jpgfile.write(pdf[istart:iend])
            njpg += 1

        return self.path, njpg
|
||||
@@ -9,3 +9,4 @@ mozjpeg-lossless-optimization>=1.2.0
|
||||
natsort>=8.4.0
|
||||
distro>=1.8.0
|
||||
numpy>=1.22.4
|
||||
PyMuPDF>=1.26.1
|
||||
|
||||
Reference in New Issue
Block a user