mirror of
https://github.com/ciromattia/kcc
synced 2025-12-12 17:26:23 +00:00
skip single pixel images in PDF (#546)
* skip pixels hopefully
* add comments and reorder
* add constant
This commit is contained in:
@@ -25,6 +25,11 @@ import os
|
||||
from random import choice
|
||||
from string import ascii_uppercase, digits
|
||||
|
||||
# Skip stray images, only a few pixels in size, that appear in some PDFs.
|
||||
# Typical images are many thousands of bytes long.
|
||||
# https://github.com/ciromattia/kcc/pull/546
|
||||
STRAY_IMAGE_LENGTH_THRESHOLD = 300
|
||||
|
||||
|
||||
class PdfJpgExtract:
|
||||
def __init__(self, fname):
|
||||
@@ -60,10 +65,15 @@ class PdfJpgExtract:
|
||||
raise Exception("Didn't find end of JPG!")
|
||||
istart += startfix
|
||||
iend += endfix
|
||||
i = iend
|
||||
|
||||
if iend - istart < STRAY_IMAGE_LENGTH_THRESHOLD:
|
||||
continue
|
||||
|
||||
jpg = pdf[istart:iend]
|
||||
jpgfile = open(self.path + "/jpg%d.jpg" % njpg, "wb")
|
||||
jpgfile.write(jpg)
|
||||
jpgfile.close()
|
||||
njpg += 1
|
||||
i = iend
|
||||
|
||||
return self.path, njpg
|
||||
|
||||
Reference in New Issue
Block a user