#186: ocr pdf if it contains no text

2026-02-06 20:31:45 +00:00 · 2018-03-12 11:12:48 +01:00
parent a66a1e6f8e
commit 647ad841df
5 changed files with 69 additions and 33 deletions
--- a/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java
+++ b/docs-core/src/main/java/com/sismics/docs/core/listener/async/FileCreatedAsyncListener.java
@@ -27,7 +27,6 @@ public class FileCreatedAsyncListener {
     * File created.
     * 
     * @param fileCreatedAsyncEvent File created event
-     * @throws Exception e
     */
    @Subscribe
    public void on(final FileCreatedAsyncEvent fileCreatedAsyncEvent) {
--- a/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java
+++ b/docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java
@@ -58,31 +58,22 @@ public class FileUtil {
        } else if (VideoUtil.isVideo(file.getMimeType())) {
            content = VideoUtil.getMetadata(unencryptedFile);
        } else if (unencryptedPdfFile != null) {
-            content = PdfUtil.extractPdf(unencryptedPdfFile);
+            content = PdfUtil.extractPdf(unencryptedPdfFile, language);
        }
        
        return content;
    }
-    
+
    /**
-     * Optical character recognition on a file.
-     * 
-     * @param unecryptedFile Unencrypted file
+     * Optical character recognition on an image.
+     *
+     * @param image Buffered image
     * @param language Language to OCR
     * @return Content extracted
     */
-    private static String ocrFile(Path unecryptedFile, String language) {
-        Tesseract instance = Tesseract.getInstance();
-        String content = null;
-        BufferedImage image;
-        try (InputStream inputStream = Files.newInputStream(unecryptedFile)) {
-            image = ImageIO.read(inputStream);
-        } catch (IOException e) {
-            log.error("Error reading the image", e);
-            return null;
-        }
-        
+    public static String ocrFile(BufferedImage image, String language) {
        // Upscale, grayscale and deskew the image
+        String content = null;
        BufferedImage resizedImage = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 3500, Scalr.OP_ANTIALIAS, Scalr.OP_GRAYSCALE);
        image.flush();
        ImageDeskew imageDeskew = new ImageDeskew(resizedImage);
@@ -92,15 +83,35 @@ public class FileUtil {

        // OCR the file
        try {
+            Tesseract instance = Tesseract.getInstance();
            log.info("Starting OCR with TESSDATA_PREFIX=" + System.getenv("TESSDATA_PREFIX") + ";LC_NUMERIC=" + System.getenv("LC_NUMERIC"));
            instance.setLanguage(language);
            content = instance.doOCR(image);
        } catch (Throwable e) {
            log.error("Error while OCR-izing the image", e);
        }
-        
+
        return content;
    }
+
+    /**
+     * Optical character recognition on a file.
+     *
+     * @param unecryptedFile Unencrypted file
+     * @param language Language to OCR
+     * @return Content extracted
+     */
+    private static String ocrFile(Path unecryptedFile, String language) {
+        BufferedImage image;
+        try (InputStream inputStream = Files.newInputStream(unecryptedFile)) {
+            image = ImageIO.read(inputStream);
+        } catch (IOException e) {
+            log.error("Error reading the image", e);
+            return null;
+        }
+
+        return ocrFile(image, language);
+    }
    
    /**
     * Save a file on the storage filesystem.
--- a/docs-core/src/main/java/com/sismics/docs/core/util/PdfUtil.java
+++ b/docs-core/src/main/java/com/sismics/docs/core/util/PdfUtil.java
@@ -59,24 +59,31 @@ public class PdfUtil {
     * Extract text from a PDF.
     * 
     * @param unencryptedPdfFile Unencrypted PDF file
+     * @param language Language used to OCR the pages when the PDF contains no text (no OCR is attempted if null)
     * @return Content extracted
     */
-    public static String extractPdf(Path unencryptedPdfFile) {
+    public static String extractPdf(Path unencryptedPdfFile, String language) {
        String content = null;
-        PDDocument pdfDocument = null;
-        try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile)) {
-            PDFTextStripper stripper = new PDFTextStripper();
-            pdfDocument = PDDocument.load(inputStream);
-            content = stripper.getText(pdfDocument);
+        try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile);
+             PDDocument pdfDocument = PDDocument.load(inputStream)) {
+            content = new PDFTextStripper().getText(pdfDocument);
        } catch (Exception e) {
            log.error("Error while extracting text from the PDF", e);
-        } finally {
-            if (pdfDocument != null) {
-                try {
-                    pdfDocument.close();
-                } catch (IOException e) {
-                    // NOP
+        }
+
+        // No text content, try to OCR it
+        if (language != null && content != null && content.trim().isEmpty()) {
+            StringBuilder sb = new StringBuilder();
+            try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile);
+                 PDDocument pdfDocument = PDDocument.load(inputStream)) {
+                PDFRenderer renderer = new PDFRenderer(pdfDocument);
+                for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) {
+                    sb.append(" ");
+                    sb.append(FileUtil.ocrFile(renderer.renderImage(pageIndex), language));
                }
+                return sb.toString();
+            } catch (Exception e) {
+                log.error("Error while OCR-izing the PDF", e);
            }
        }