mirror of
https://github.com/sismics/docs.git
synced 2025-12-23 14:41:40 +00:00
#186: ocr pdf if it contains no text
This commit is contained in:
@@ -27,7 +27,6 @@ public class FileCreatedAsyncListener {
|
||||
* File created.
|
||||
*
|
||||
* @param fileCreatedAsyncEvent File created event
|
||||
* @throws Exception e
|
||||
*/
|
||||
@Subscribe
|
||||
public void on(final FileCreatedAsyncEvent fileCreatedAsyncEvent) {
|
||||
|
||||
@@ -58,31 +58,22 @@ public class FileUtil {
|
||||
} else if (VideoUtil.isVideo(file.getMimeType())) {
|
||||
content = VideoUtil.getMetadata(unencryptedFile);
|
||||
} else if (unencryptedPdfFile != null) {
|
||||
content = PdfUtil.extractPdf(unencryptedPdfFile);
|
||||
content = PdfUtil.extractPdf(unencryptedPdfFile, language);
|
||||
}
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Optical character recognition on a file.
|
||||
*
|
||||
* @param unecryptedFile Unencrypted file
|
||||
* Optical character recognition on an image.
|
||||
*
|
||||
* @param image Buffered image
|
||||
* @param language Language to OCR
|
||||
* @return Content extracted
|
||||
*/
|
||||
private static String ocrFile(Path unecryptedFile, String language) {
|
||||
Tesseract instance = Tesseract.getInstance();
|
||||
String content = null;
|
||||
BufferedImage image;
|
||||
try (InputStream inputStream = Files.newInputStream(unecryptedFile)) {
|
||||
image = ImageIO.read(inputStream);
|
||||
} catch (IOException e) {
|
||||
log.error("Error reading the image", e);
|
||||
return null;
|
||||
}
|
||||
|
||||
public static String ocrFile(BufferedImage image, String language) {
|
||||
// Upscale, grayscale and deskew the image
|
||||
String content = null;
|
||||
BufferedImage resizedImage = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 3500, Scalr.OP_ANTIALIAS, Scalr.OP_GRAYSCALE);
|
||||
image.flush();
|
||||
ImageDeskew imageDeskew = new ImageDeskew(resizedImage);
|
||||
@@ -92,15 +83,35 @@ public class FileUtil {
|
||||
|
||||
// OCR the file
|
||||
try {
|
||||
Tesseract instance = Tesseract.getInstance();
|
||||
log.info("Starting OCR with TESSDATA_PREFIX=" + System.getenv("TESSDATA_PREFIX") + ";LC_NUMERIC=" + System.getenv("LC_NUMERIC"));
|
||||
instance.setLanguage(language);
|
||||
content = instance.doOCR(image);
|
||||
} catch (Throwable e) {
|
||||
log.error("Error while OCR-izing the image", e);
|
||||
}
|
||||
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
/**
|
||||
* Optical character recognition on a file.
|
||||
*
|
||||
* @param unecryptedFile Unencrypted file
|
||||
* @param language Language to OCR
|
||||
* @return Content extracted
|
||||
*/
|
||||
private static String ocrFile(Path unecryptedFile, String language) {
|
||||
BufferedImage image;
|
||||
try (InputStream inputStream = Files.newInputStream(unecryptedFile)) {
|
||||
image = ImageIO.read(inputStream);
|
||||
} catch (IOException e) {
|
||||
log.error("Error reading the image", e);
|
||||
return null;
|
||||
}
|
||||
|
||||
return ocrFile(image, language);
|
||||
}
|
||||
|
||||
/**
|
||||
* Save a file on the storage filesystem.
|
||||
|
||||
@@ -59,24 +59,31 @@ public class PdfUtil {
|
||||
* Extract text from a PDF.
|
||||
*
|
||||
* @param unencryptedPdfFile Unencrypted PDF file
|
||||
* @param language Language
|
||||
* @return Content extracted
|
||||
*/
|
||||
public static String extractPdf(Path unencryptedPdfFile) {
|
||||
public static String extractPdf(Path unencryptedPdfFile, String language) {
|
||||
String content = null;
|
||||
PDDocument pdfDocument = null;
|
||||
try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile)) {
|
||||
PDFTextStripper stripper = new PDFTextStripper();
|
||||
pdfDocument = PDDocument.load(inputStream);
|
||||
content = stripper.getText(pdfDocument);
|
||||
try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile);
|
||||
PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
||||
content = new PDFTextStripper().getText(pdfDocument);
|
||||
} catch (Exception e) {
|
||||
log.error("Error while extracting text from the PDF", e);
|
||||
} finally {
|
||||
if (pdfDocument != null) {
|
||||
try {
|
||||
pdfDocument.close();
|
||||
} catch (IOException e) {
|
||||
// NOP
|
||||
}
|
||||
|
||||
// No text content, try to OCR it
|
||||
if (language != null && content != null && content.trim().isEmpty()) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile);
|
||||
PDDocument pdfDocument = PDDocument.load(inputStream)) {
|
||||
PDFRenderer renderer = new PDFRenderer(pdfDocument);
|
||||
for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) {
|
||||
sb.append(" ");
|
||||
sb.append(FileUtil.ocrFile(renderer.renderImage(pageIndex), language));
|
||||
}
|
||||
return sb.toString();
|
||||
} catch (Exception e) {
|
||||
log.error("Error while OCR-izing the PDF", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user