mirror of
https://github.com/ciromattia/kcc
synced 2025-12-23 22:51:45 +00:00
skip single pixel images in PDF (#546)
* skip pixels hopefully * add comments and reorder * add constant
This commit is contained in:
@@ -25,6 +25,11 @@ import os
|
|||||||
from random import choice
|
from random import choice
|
||||||
from string import ascii_uppercase, digits
|
from string import ascii_uppercase, digits
|
||||||
|
|
||||||
|
# skip stray images, only a few pixels in size, that appear in some PDFs
|
||||||
|
# typical images are many thousands of bytes in length
|
||||||
|
# https://github.com/ciromattia/kcc/pull/546
|
||||||
|
STRAY_IMAGE_LENGTH_THRESHOLD = 300
|
||||||
|
|
||||||
|
|
||||||
class PdfJpgExtract:
|
class PdfJpgExtract:
|
||||||
def __init__(self, fname):
|
def __init__(self, fname):
|
||||||
@@ -60,10 +65,15 @@ class PdfJpgExtract:
|
|||||||
raise Exception("Didn't find end of JPG!")
|
raise Exception("Didn't find end of JPG!")
|
||||||
istart += startfix
|
istart += startfix
|
||||||
iend += endfix
|
iend += endfix
|
||||||
|
i = iend
|
||||||
|
|
||||||
|
if iend - istart < STRAY_IMAGE_LENGTH_THRESHOLD:
|
||||||
|
continue
|
||||||
|
|
||||||
jpg = pdf[istart:iend]
|
jpg = pdf[istart:iend]
|
||||||
jpgfile = open(self.path + "/jpg%d.jpg" % njpg, "wb")
|
jpgfile = open(self.path + "/jpg%d.jpg" % njpg, "wb")
|
||||||
jpgfile.write(jpg)
|
jpgfile.write(jpg)
|
||||||
jpgfile.close()
|
jpgfile.close()
|
||||||
njpg += 1
|
njpg += 1
|
||||||
i = iend
|
|
||||||
return self.path, njpg
|
return self.path, njpg
|
||||||
|
|||||||
Reference in New Issue
Block a user