mirror of
https://github.com/sismics/docs.git
synced 2025-12-16 03:06:22 +00:00
Closes #373: high quality PDF to image conversion before OCR
This commit is contained in:
@@ -6,6 +6,7 @@ import com.sismics.util.mime.MimeType;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.multipdf.PDFMergerUtility;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.rendering.ImageType;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.slf4j.Logger;
|
||||
@@ -60,7 +61,7 @@ public class PdfFormatHandler implements FormatHandler {
|
||||
for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) {
|
||||
log.info("OCR page " + (pageIndex + 1) + "/" + pdfDocument.getNumberOfPages() + " of PDF file containing only images");
|
||||
sb.append(" ");
|
||||
sb.append(FileUtil.ocrFile(language, renderer.renderImage(pageIndex)));
|
||||
sb.append(FileUtil.ocrFile(language, renderer.renderImageWithDPI(pageIndex, 300, ImageType.GRAY)));
|
||||
}
|
||||
return sb.toString();
|
||||
} catch (Exception e) {
|
||||
|
||||
@@ -0,0 +1,19 @@
|
||||
package com.sismics.util.format;
|
||||
|
||||
import com.sismics.docs.core.util.format.PdfFormatHandler;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.nio.file.Paths;
|
||||
|
||||
public class TestPdfFormatHandler {
|
||||
@Test
|
||||
public void testIssue373() throws Exception {
|
||||
PdfFormatHandler formatHandler = new PdfFormatHandler();
|
||||
String content = formatHandler.extractContent("deu", Paths.get(ClassLoader.getSystemResource("file/issue373.pdf").toURI()));
|
||||
Assert.assertTrue(content.contains("Aufrechterhaltung"));
|
||||
Assert.assertTrue(content.contains("Außentemperatur"));
|
||||
Assert.assertTrue(content.contains("Grundumsatzmessungen"));
|
||||
Assert.assertTrue(content.contains("ermitteln"));
|
||||
}
|
||||
}
|
||||
BIN
docs-core/src/test/resources/file/issue373.pdf
Normal file
BIN
docs-core/src/test/resources/file/issue373.pdf
Normal file
Binary file not shown.
Reference in New Issue
Block a user