mirror of
https://github.com/ciromattia/kcc
synced 2025-12-15 10:46:40 +00:00
Improve pdf support by using mupdf (#983)
* Improve pdf support with mupdf * parallel page ranges not pages * fix black blank * remove full=True * add TODO * fix doc close --------- Co-authored-by: Alex Xu <alexkurosakimh3@gmail.com>
This commit is contained in:
@@ -32,7 +32,7 @@ from typing import List
|
|||||||
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
|
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
|
||||||
from tempfile import mkdtemp, gettempdir, TemporaryFile
|
from tempfile import mkdtemp, gettempdir, TemporaryFile
|
||||||
from shutil import move, copytree, rmtree, copyfile
|
from shutil import move, copytree, rmtree, copyfile
|
||||||
from multiprocessing import Pool
|
from multiprocessing import Pool, cpu_count
|
||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
from natsort import os_sort_keygen, os_sorted
|
from natsort import os_sort_keygen, os_sorted
|
||||||
from slugify import slugify as slugify_ext
|
from slugify import slugify as slugify_ext
|
||||||
@@ -41,13 +41,14 @@ from pathlib import Path
|
|||||||
from subprocess import STDOUT, PIPE, CalledProcessError
|
from subprocess import STDOUT, PIPE, CalledProcessError
|
||||||
from psutil import virtual_memory, disk_usage
|
from psutil import virtual_memory, disk_usage
|
||||||
from html import escape as hescape
|
from html import escape as hescape
|
||||||
|
import pymupdf
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
from .shared import getImageFileName, walkSort, walkLevel, sanitizeTrace, subprocess_run
|
from .shared import getImageFileName, walkSort, walkLevel, sanitizeTrace, subprocess_run
|
||||||
from .comicarchive import SEVENZIP, available_archive_tools
|
from .comicarchive import SEVENZIP, available_archive_tools
|
||||||
from . import comic2panel
|
from . import comic2panel
|
||||||
from . import image
|
from . import image
|
||||||
from . import comicarchive
|
from . import comicarchive
|
||||||
from . import pdfjpgextract
|
|
||||||
from . import dualmetafix
|
from . import dualmetafix
|
||||||
from . import metadata
|
from . import metadata
|
||||||
from . import kindle
|
from . import kindle
|
||||||
@@ -666,6 +667,141 @@ def imgFileProcessing(work):
|
|||||||
return str(sys.exc_info()[1]), sanitizeTrace(sys.exc_info()[2])
|
return str(sys.exc_info()[1]), sanitizeTrace(sys.exc_info()[2])
|
||||||
|
|
||||||
|
|
||||||
|
def render_page(vector):
    """Render one contiguous segment of a PDF's pages to PNG files.

    Intended to be used as a ``multiprocessing.Pool`` worker. The PyMuPDF
    document is not passed in (the original notes state it cannot be
    pickled); only its filename is, and the document is opened here.
    Everything in ``vector`` is pickled by the pool, so it is kept small.

    The page range handled by this call is derived from the segment index:
    the document is divided into ``cpu`` segments of equal size and segment
    ``idx`` is rendered. Each page is scaled so that its height becomes
    ``target_height`` and is written to ``<output_dir>/p-<page index>.png``.

    Args:
        vector: sequence ``(idx, cpu, filename, output_dir, target_height)``
            where ``idx`` is the segment number to process, ``cpu`` the total
            number of segments, ``filename`` the document path,
            ``output_dir`` the directory receiving the PNG files and
            ``target_height`` the desired height of the rendered pages.

    Raises:
        UserWarning: if the document cannot be opened, rendered or saved.
            The underlying exception is chained as ``__cause__``.
    """
    # recreate the arguments
    idx = vector[0]  # this is the segment number we have to process
    cpu = vector[1]  # number of segments (one per CPU)
    filename = vector[2]  # document filename
    output_dir = vector[3]
    target_height = vector[4]
    try:
        with pymupdf.open(filename) as doc:
            num_pages = doc.page_count

            # pages per segment: make sure that cpu * seg_size >= num_pages!
            seg_size = int(num_pages / cpu + 1)
            seg_from = idx * seg_size  # our first page number
            seg_to = min(seg_from + seg_size, num_pages)  # one past our last page

            for i in range(seg_from, seg_to):
                page = doc[i]
                zoom = target_height / page.rect.height
                # get_pixmap() takes a matrix-like object, not a bare scale
                # factor, so build a uniform zoom matrix from the factor.
                mat = pymupdf.Matrix(zoom, zoom)
                # TODO: decide colorspace earlier so later color check is cheaper.
                pix = page.get_pixmap(matrix=mat, colorspace='RGB', alpha=False)
                pix.save(os.path.join(output_dir, "p-%i.png" % i))

        # Trailing segments can be empty when there are fewer pages than
        # segments; only report ranges that were actually processed.
        if seg_from < seg_to:
            print("Processed page numbers %i through %i" % (seg_from, seg_to - 1))
    except Exception as e:
        raise UserWarning(f"Error rendering {filename}: {e}") from e
|
||||||
|
|
||||||
|
|
||||||
|
def extract_page(vector):
    """Export the single embedded image of each page in a page segment.

    For pages with a single image (and no text). Otherwise it is recommended
    to use render_page().

    Intended to be used as a ``multiprocessing.Pool`` worker. The PyMuPDF
    document is not passed in (the original notes state it cannot be
    pickled); only its filename is, and the document is opened here.

    The document is divided into ``cpu`` segments of equal size and segment
    ``idx`` is processed. For every page in the segment a file
    ``<output_dir>/p-<page index>.png`` is written:

    * a page without images produces a white page of the page's size;
    * an image without colorspace is treated as a stencil mask and is saved
      as an inverted grayscale image;
    * any other image is converted to gray or RGB, stripped of its alpha
      channel, and saved.

    Args:
        vector: sequence ``(idx, cpu, filename, output_dir, ...)`` where
            ``idx`` is the segment number to process, ``cpu`` the total
            number of segments, ``filename`` the document path and
            ``output_dir`` the directory receiving the PNG files. Additional
            trailing items are ignored.

    Raises:
        UserWarning: if a page holds more than one image, or if opening,
            decoding or saving fails. The underlying exception is chained
            as ``__cause__``.
    """
    # recreate the arguments
    idx = vector[0]  # this is the segment number we have to process
    cpu = vector[1]  # number of segments (one per CPU)
    filename = vector[2]  # document filename
    output_dir = vector[3]

    try:
        with pymupdf.open(filename) as doc:
            num_pages = doc.page_count

            # pages per segment: make sure that cpu * seg_size >= num_pages!
            seg_size = int(num_pages / cpu + 1)
            seg_from = idx * seg_size  # our first page number
            seg_to = min(seg_from + seg_size, num_pages)  # one past our last page

            for i in range(seg_from, seg_to):
                output_path = os.path.join(output_dir, "p-%i.png" % i)
                page = doc.load_page(i)
                image_list = page.get_images()
                if len(image_list) > 1:
                    raise UserWarning("mupdf_pdf_extract_page_image() function can be used only with single image pages.")

                if not image_list:
                    # Nothing to extract: emit a white placeholder and move
                    # on, otherwise image_list[0] below raises IndexError.
                    width, height = int(page.rect.width), int(page.rect.height)
                    blank_page = Image.new("RGB", (width, height), "white")
                    blank_page.save(output_path)
                    continue

                xref = image_list[0][0]
                pix = pymupdf.Pixmap(doc, xref)

                if pix.colorspace is None:
                    # It's a stencil mask (grayscale image with inverted colors)
                    mask_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width)
                    inverted = 255 - mask_array
                    img = Image.fromarray(inverted, mode="L")
                    img.save(output_path)
                    # The page is done; the colorspace checks below would
                    # dereference None.
                    continue

                if pix.colorspace.name.startswith("Colorspace(CS_GRAY)"):
                    # Make sure that an image is just grayscale and not smth like "Colorspace(CS_GRAY) - Separation(DeviceCMYK,Black)"
                    pix = pymupdf.Pixmap(pymupdf.csGRAY, pix)
                else:
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)

                if pix.alpha:
                    pix = pymupdf.Pixmap(pix, alpha=0)  # drop the alpha channel
                pix.save(output_path)

        # Trailing segments can be empty when there are fewer pages than
        # segments; only report ranges that were actually processed.
        if seg_from < seg_to:
            print("Processed page numbers %i through %i" % (seg_from, seg_to - 1))
    except Exception as e:
        raise UserWarning(f"Error exporting {filename}: {e}") from e
|
||||||
|
|
||||||
|
|
||||||
|
def mupdf_pdf_process_pages_parallel(filename, output_dir, target_height):
    """Convert every page of a PDF to a PNG file using a process pool.

    The document is first scanned: if any page contains text or more than
    one image, all pages are rendered with render_page(); otherwise each
    page's single embedded image is exported with extract_page().

    The pages are split into ``cpu_count()`` segments and the segments are
    distributed over the pool's worker processes.

    Args:
        filename: path of the PDF document.
        output_dir: directory that receives the ``p-<page index>.png`` files.
        target_height: page height passed to render_page() when rendering.

    Raises:
        UserWarning: if any worker fails. The underlying exception is
            chained as ``__cause__``.
    """
    render = False
    with pymupdf.open(filename) as doc:
        for page in doc:
            # Text or several images on a page cannot be handled by plain
            # image extraction, so the whole document gets rendered.
            if page.get_text().strip() != "" or len(page.get_images()) > 1:
                render = True
                break

    cpu = cpu_count()
    # Leave one CPU free, but never ask for zero workers: Pool() rejects
    # processes < 1, which is what cpu_count() - 1 gives on a single-CPU host.
    workers = max(1, cpu - 1)

    # make vectors of arguments for the processes
    vectors = [(i, cpu, filename, output_dir, target_height) for i in range(cpu)]
    print("Starting %i processes for '%s'." % (workers, filename))

    worker = render_page if render else extract_page
    try:
        with Pool(processes=workers) as pool:
            pool.map(worker, vectors)
    except Exception as e:
        raise UserWarning(f"Error while processing PDF pages: {e}") from e
|
||||||
|
|
||||||
|
|
||||||
def getWorkFolder(afile):
|
def getWorkFolder(afile):
|
||||||
if os.path.isdir(afile):
|
if os.path.isdir(afile):
|
||||||
if disk_usage(gettempdir())[2] < getDirectorySize(afile) * 2.5:
|
if disk_usage(gettempdir())[2] < getDirectorySize(afile) * 2.5:
|
||||||
@@ -684,13 +820,19 @@ def getWorkFolder(afile):
|
|||||||
if disk_usage(gettempdir())[2] < os.path.getsize(afile) * 2.5:
|
if disk_usage(gettempdir())[2] < os.path.getsize(afile) * 2.5:
|
||||||
raise UserWarning("Not enough disk space to perform conversion.")
|
raise UserWarning("Not enough disk space to perform conversion.")
|
||||||
if afile.lower().endswith('.pdf'):
|
if afile.lower().endswith('.pdf'):
|
||||||
pdf = pdfjpgextract.PdfJpgExtract(afile)
|
workdir = mkdtemp('', 'KCC-', os.path.dirname(afile))
|
||||||
path, njpg = pdf.extract()
|
path = workdir
|
||||||
workdir = path
|
|
||||||
sanitizePermissions(path)
|
sanitizePermissions(path)
|
||||||
if njpg == 0:
|
target_height = options.profileData[1][1]
|
||||||
|
if options.cropping == 1:
|
||||||
|
target_height = target_height + target_height*0.20 #Account for possible margin at the top and bottom
|
||||||
|
elif options.cropping == 2:
|
||||||
|
target_height = target_height + target_height*0.25 #Account for possible margin at the top and bottom with page number
|
||||||
|
try:
|
||||||
|
mupdf_pdf_process_pages_parallel(afile, workdir, target_height)
|
||||||
|
except Exception as e:
|
||||||
rmtree(path, True)
|
rmtree(path, True)
|
||||||
raise UserWarning("Failed to extract images from PDF file.")
|
raise UserWarning(f"Failed to extract images from PDF file. {e}")
|
||||||
else:
|
else:
|
||||||
workdir = mkdtemp('', 'KCC-', os.path.dirname(afile))
|
workdir = mkdtemp('', 'KCC-', os.path.dirname(afile))
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -1,79 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
#
|
|
||||||
# Copyright (c) 2012-2014 Ciro Mattia Gonano <ciromattia@gmail.com>
|
|
||||||
# Copyright (c) 2013-2019 Pawel Jastrzebski <pawelj@iosphe.re>
|
|
||||||
#
|
|
||||||
# Based upon the code snippet by Ned Batchelder
|
|
||||||
# (http://nedbatchelder.com/blog/200712/extracting_jpgs_from_pdfs.html)
|
|
||||||
#
|
|
||||||
# Permission to use, copy, modify, and/or distribute this software for
|
|
||||||
# any purpose with or without fee is hereby granted, provided that the
|
|
||||||
# above copyright notice and this permission notice appear in all
|
|
||||||
# copies.
|
|
||||||
#
|
|
||||||
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
|
|
||||||
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
|
|
||||||
# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
|
|
||||||
# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
|
|
||||||
# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
|
|
||||||
# OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
|
|
||||||
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
|
||||||
# PERFORMANCE OF THIS SOFTWARE.
|
|
||||||
#
|
|
||||||
|
|
||||||
import os
|
|
||||||
from random import choice
|
|
||||||
from string import ascii_uppercase, digits
|
|
||||||
|
|
||||||
# Skip stray images a few pixels in size that occur in some PDFs;
# typical images are many thousands of bytes in length.
# https://github.com/ciromattia/kcc/pull/546
STRAY_IMAGE_LENGTH_THRESHOLD = 300


class PdfJpgExtract:
    """Extract embedded JPEG streams from a PDF file by scanning its bytes.

    The PDF is not parsed: the raw file content is searched for ``stream``
    keywords that are followed, within 20 bytes, by the JPEG start marker,
    and the bytes up to the JPEG end marker near the matching ``endstream``
    are written out as individual ``.jpg`` files.
    """

    def __init__(self, fname):
        """Remember the source file and choose the output directory name.

        Args:
            fname: path of the PDF file to scan.
        """
        self.fname = fname
        self.filename = os.path.splitext(fname)
        # Output directory: "<pdf path without extension>-KCC-" followed by
        # three random uppercase letters/digits.
        self.path = self.filename[0] + "-KCC-" + ''.join(choice(ascii_uppercase + digits) for _ in range(3))

    def getPath(self):
        """Return the directory the extracted images are written to."""
        return self.path

    def extract(self):
        """Write every embedded JPEG to the output directory.

        The output directory is created by this call. Files are named
        ``jpg0.jpg``, ``jpg1.jpg``, ... in order of appearance. Candidates
        shorter than STRAY_IMAGE_LENGTH_THRESHOLD bytes are skipped.

        Returns:
            A tuple ``(path, njpg)`` of the output directory and the number
            of JPEG files written.

        Raises:
            Exception: if a JPEG start marker is found but the matching
                ``endstream`` keyword or JPEG end marker is missing.
        """
        # Context manager so the handle is closed even if read() fails.
        with open(self.fname, "rb") as pdffile:
            pdf = pdffile.read()

        startmark = b"\xff\xd8"
        endmark = b"\xff\xd9"
        endfix = len(endmark)  # include the end marker itself in the slice
        i = 0
        njpg = 0
        os.makedirs(self.path)
        while True:
            istream = pdf.find(b"stream", i)
            if istream < 0:
                break
            # The JPEG data must begin right after the "stream" keyword.
            istart = pdf.find(startmark, istream, istream + 20)
            if istart < 0:
                i = istream + 20
                continue
            iend = pdf.find(b"endstream", istart)
            if iend < 0:
                raise Exception("Didn't find end of stream!")
            # The end marker is expected shortly before "endstream".
            iend = pdf.find(endmark, iend - 20)
            if iend < 0:
                raise Exception("Didn't find end of JPG!")
            iend += endfix
            i = iend

            if iend - istart < STRAY_IMAGE_LENGTH_THRESHOLD:
                continue

            with open(os.path.join(self.path, "jpg%d.jpg" % njpg), "wb") as jpgfile:
                jpgfile.write(pdf[istart:iend])
            njpg += 1

        return self.path, njpg
|
|
||||||
@@ -9,3 +9,4 @@ mozjpeg-lossless-optimization>=1.2.0
|
|||||||
natsort>=8.4.0
|
natsort>=8.4.0
|
||||||
distro>=1.8.0
|
distro>=1.8.0
|
||||||
numpy>=1.22.4
|
numpy>=1.22.4
|
||||||
|
PyMuPDF>=1.26.1
|
||||||
|
|||||||
Reference in New Issue
Block a user