1
0
mirror of https://github.com/ciromattia/kcc synced 2025-12-13 09:46:25 +00:00

skip single pixel images in PDF (#546)

* skip pixels hopefully

* add comments and reorder

* add constant
This commit is contained in:
Alex Xu
2023-08-05 08:37:00 -07:00
committed by GitHub
parent 154707a412
commit 9339abb267

View File

@@ -25,6 +25,11 @@ import os
from random import choice
from string import ascii_uppercase, digits
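# Skip stray JPEG streams, only a few pixels in size, that appear in some PDFs.
# Real images are many thousands of bytes long, so any extracted stream shorter
# than this many bytes is treated as a stray image and is not written out.
# https://github.com/ciromattia/kcc/pull/546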
STRAY_IMAGE_LENGTH_THRESHOLD = 300
class PdfJpgExtract:
def __init__(self, fname):
@@ -60,10 +65,15 @@ class PdfJpgExtract:
raise Exception("Didn't find end of JPG!")
istart += startfix
iend += endfix
i = iend
if iend - istart < STRAY_IMAGE_LENGTH_THRESHOLD:
continue
jpg = pdf[istart:iend]
jpgfile = open(self.path + "/jpg%d.jpg" % njpg, "wb")
jpgfile.write(jpg)
jpgfile.close()
njpg += 1
i = iend
return self.path, njpg