1
0
mirror of https://github.com/sismics/docs.git synced 2025-12-23 14:41:40 +00:00

#186: ocr pdf if it contains no text

This commit is contained in:
Benjamin Gamard
2018-03-12 11:12:48 +01:00
parent a66a1e6f8e
commit 647ad841df
5 changed files with 69 additions and 33 deletions

View File

@@ -27,7 +27,6 @@ public class FileCreatedAsyncListener {
* File created.
*
* @param fileCreatedAsyncEvent File created event
* @throws Exception e
*/
@Subscribe
public void on(final FileCreatedAsyncEvent fileCreatedAsyncEvent) {

View File

@@ -58,31 +58,22 @@ public class FileUtil {
} else if (VideoUtil.isVideo(file.getMimeType())) {
content = VideoUtil.getMetadata(unencryptedFile);
} else if (unencryptedPdfFile != null) {
content = PdfUtil.extractPdf(unencryptedPdfFile);
content = PdfUtil.extractPdf(unencryptedPdfFile, language);
}
return content;
}
/**
* Optical character recognition on a file.
*
* @param unecryptedFile Unencrypted file
* Optical character recognition on an image.
*
* @param image Buffered image
* @param language Language to OCR
* @return Content extracted
*/
private static String ocrFile(Path unecryptedFile, String language) {
Tesseract instance = Tesseract.getInstance();
String content = null;
BufferedImage image;
try (InputStream inputStream = Files.newInputStream(unecryptedFile)) {
image = ImageIO.read(inputStream);
} catch (IOException e) {
log.error("Error reading the image", e);
return null;
}
public static String ocrFile(BufferedImage image, String language) {
// Upscale, grayscale and deskew the image
String content = null;
BufferedImage resizedImage = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 3500, Scalr.OP_ANTIALIAS, Scalr.OP_GRAYSCALE);
image.flush();
ImageDeskew imageDeskew = new ImageDeskew(resizedImage);
@@ -92,15 +83,35 @@ public class FileUtil {
// OCR the file
try {
Tesseract instance = Tesseract.getInstance();
log.info("Starting OCR with TESSDATA_PREFIX=" + System.getenv("TESSDATA_PREFIX") + ";LC_NUMERIC=" + System.getenv("LC_NUMERIC"));
instance.setLanguage(language);
content = instance.doOCR(image);
} catch (Throwable e) {
log.error("Error while OCR-izing the image", e);
}
return content;
}
/**
 * Optical character recognition on an image file.
 *
 * Decodes the file into a BufferedImage with ImageIO, then delegates to the
 * BufferedImage overload of ocrFile.
 *
 * @param unecryptedFile Unencrypted file
 * @param language Language to OCR
 * @return Content extracted, or null if the file cannot be read or decoded as an image
 *         (the BufferedImage overload may also return null)
 */
private static String ocrFile(Path unecryptedFile, String language) {
    BufferedImage image;
    try (InputStream inputStream = Files.newInputStream(unecryptedFile)) {
        image = ImageIO.read(inputStream);
    } catch (IOException e) {
        log.error("Error reading the image", e);
        return null;
    }

    // ImageIO.read returns null (without throwing) when no registered reader can decode the
    // stream; bail out here the same way as for an I/O error instead of passing null downstream
    if (image == null) {
        log.error("Error reading the image: no image reader could decode the file");
        return null;
    }

    return ocrFile(image, language);
}
/**
* Save a file on the storage filesystem.

View File

@@ -59,24 +59,31 @@ public class PdfUtil {
* Extract text from a PDF.
*
* @param unencryptedPdfFile Unencrypted PDF file
* @param language Language
* @return Content extracted
*/
public static String extractPdf(Path unencryptedPdfFile) {
public static String extractPdf(Path unencryptedPdfFile, String language) {
String content = null;
PDDocument pdfDocument = null;
try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile)) {
PDFTextStripper stripper = new PDFTextStripper();
pdfDocument = PDDocument.load(inputStream);
content = stripper.getText(pdfDocument);
try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile);
PDDocument pdfDocument = PDDocument.load(inputStream)) {
content = new PDFTextStripper().getText(pdfDocument);
} catch (Exception e) {
log.error("Error while extracting text from the PDF", e);
} finally {
if (pdfDocument != null) {
try {
pdfDocument.close();
} catch (IOException e) {
// NOP
}
// No text content, try to OCR it
if (language != null && content != null && content.trim().isEmpty()) {
StringBuilder sb = new StringBuilder();
try (InputStream inputStream = Files.newInputStream(unencryptedPdfFile);
PDDocument pdfDocument = PDDocument.load(inputStream)) {
PDFRenderer renderer = new PDFRenderer(pdfDocument);
for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) {
sb.append(" ");
sb.append(FileUtil.ocrFile(renderer.renderImage(pageIndex), language));
}
return sb.toString();
} catch (Exception e) {
log.error("Error while OCR-izing the PDF", e);
}
}