mirror of
https://github.com/sismics/docs.git
synced 2025-12-15 10:46:26 +00:00
Index and generate thumbnails from PDF
This commit is contained in:
@@ -117,6 +117,11 @@
|
||||
<artifactId>imgscalr-lib</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.pdfbox</groupId>
|
||||
<artifactId>pdfbox</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- OCR dependencies -->
|
||||
<dependency>
|
||||
<groupId>jna</groupId>
|
||||
|
||||
@@ -3,11 +3,11 @@ package com.sismics.docs.core.event;
|
||||
import com.google.common.base.Objects;
|
||||
|
||||
/**
|
||||
* OCR all files in database event.
|
||||
* Extract file content event.
|
||||
*
|
||||
* @author bgamard
|
||||
*/
|
||||
public class OcrFileAsyncEvent {
|
||||
public class ExtractFileAsyncEvent {
|
||||
@Override
|
||||
public String toString() {
|
||||
return Objects.toStringHelper(this)
|
||||
@@ -9,33 +9,33 @@ import org.slf4j.LoggerFactory;
|
||||
import com.google.common.eventbus.Subscribe;
|
||||
import com.sismics.docs.core.dao.jpa.DocumentDao;
|
||||
import com.sismics.docs.core.dao.jpa.FileDao;
|
||||
import com.sismics.docs.core.event.OcrFileAsyncEvent;
|
||||
import com.sismics.docs.core.event.ExtractFileAsyncEvent;
|
||||
import com.sismics.docs.core.model.jpa.Document;
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
import com.sismics.docs.core.util.FileUtil;
|
||||
import com.sismics.docs.core.util.TransactionUtil;
|
||||
|
||||
/**
|
||||
* Listener on OCR all files in database.
|
||||
* Listener on extract content from all files.
|
||||
*
|
||||
* @author bgamard
|
||||
*/
|
||||
public class OcrFileAsyncListener {
|
||||
public class ExtractFileAsyncListener {
|
||||
/**
|
||||
* Logger.
|
||||
*/
|
||||
private static final Logger log = LoggerFactory.getLogger(OcrFileAsyncListener.class);
|
||||
private static final Logger log = LoggerFactory.getLogger(ExtractFileAsyncListener.class);
|
||||
|
||||
/**
|
||||
* OCR all files.
|
||||
* Extract content from all files.
|
||||
*
|
||||
* @param ocrFileAsyncEvent OCR all files in database event
|
||||
* @param extractFileAsyncEvent Extract file content event
|
||||
* @throws Exception
|
||||
*/
|
||||
@Subscribe
|
||||
public void on(final OcrFileAsyncEvent ocrFileAsyncEvent) throws Exception {
|
||||
public void on(final ExtractFileAsyncEvent extractFileAsyncEvent) throws Exception {
|
||||
if (log.isInfoEnabled()) {
|
||||
log.info("OCR all files in database event: " + ocrFileAsyncEvent.toString());
|
||||
log.info("Extract file content event: " + extractFileAsyncEvent.toString());
|
||||
}
|
||||
|
||||
TransactionUtil.handle(new Runnable() {
|
||||
@@ -47,10 +47,9 @@ public class OcrFileAsyncListener {
|
||||
for (File file : fileList) {
|
||||
long startTime = System.currentTimeMillis();
|
||||
Document document = documentDao.getById(file.getDocumentId());
|
||||
String content = FileUtil.ocrFile(document, file);
|
||||
file.setContent(content);
|
||||
file.setContent(FileUtil.extractContent(document, file));
|
||||
TransactionUtil.commit();
|
||||
log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime));
|
||||
log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime));
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -39,7 +39,7 @@ public class FileCreatedAsyncListener {
|
||||
// OCR the file
|
||||
final File file = fileCreatedAsyncEvent.getFile();
|
||||
long startTime = System.currentTimeMillis();
|
||||
final String content = FileUtil.ocrFile(fileCreatedAsyncEvent.getDocument(), file);
|
||||
final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file);
|
||||
log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime));
|
||||
|
||||
// Store the OCR-ization result in the database
|
||||
|
||||
@@ -16,7 +16,7 @@ import com.sismics.docs.core.listener.async.DocumentDeletedAsyncListener;
|
||||
import com.sismics.docs.core.listener.async.DocumentUpdatedAsyncListener;
|
||||
import com.sismics.docs.core.listener.async.FileCreatedAsyncListener;
|
||||
import com.sismics.docs.core.listener.async.FileDeletedAsyncListener;
|
||||
import com.sismics.docs.core.listener.async.OcrFileAsyncListener;
|
||||
import com.sismics.docs.core.listener.async.ExtractFileAsyncListener;
|
||||
import com.sismics.docs.core.listener.async.RebuildIndexAsyncListener;
|
||||
import com.sismics.docs.core.listener.sync.DeadEventListener;
|
||||
import com.sismics.docs.core.model.jpa.Config;
|
||||
@@ -82,7 +82,7 @@ public class AppContext {
|
||||
asyncEventBus.register(new DocumentUpdatedAsyncListener());
|
||||
asyncEventBus.register(new DocumentDeletedAsyncListener());
|
||||
asyncEventBus.register(new RebuildIndexAsyncListener());
|
||||
asyncEventBus.register(new OcrFileAsyncListener());
|
||||
asyncEventBus.register(new ExtractFileAsyncListener());
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -6,11 +6,15 @@ import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.List;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import net.sourceforge.tess4j.Tesseract;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.util.PDFTextStripper;
|
||||
import org.imgscalr.Scalr;
|
||||
import org.imgscalr.Scalr.Method;
|
||||
import org.imgscalr.Scalr.Mode;
|
||||
@@ -20,6 +24,7 @@ import org.slf4j.LoggerFactory;
|
||||
import com.sismics.docs.core.model.jpa.Document;
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
import com.sismics.util.ImageUtil;
|
||||
import com.sismics.util.mime.MimeType;
|
||||
|
||||
/**
|
||||
* File entity utilities.
|
||||
@@ -33,18 +38,32 @@ public class FileUtil {
|
||||
private static final Logger log = LoggerFactory.getLogger(FileUtil.class);
|
||||
|
||||
/**
|
||||
* OCR a file.
|
||||
* Extract content from a file.
|
||||
*
|
||||
* @param document Document linked to the file
|
||||
* @param file File to extract
|
||||
* @return Content extract
|
||||
*/
|
||||
public static String extractContent(Document document, File file) {
|
||||
String content = null;
|
||||
|
||||
if (ImageUtil.isImage(file.getMimeType())) {
|
||||
content = ocrFile(document, file);
|
||||
} else if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
|
||||
content = extractPdf(file);
|
||||
}
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
/**
|
||||
* Optical character recognition on a file.
|
||||
*
|
||||
* @param document Document linked to the file
|
||||
* @param file File to OCR
|
||||
* @return OCR-ized content
|
||||
* @return Content extracted
|
||||
*/
|
||||
public static String ocrFile(Document document, final File file) {
|
||||
if (!ImageUtil.isImage(file.getMimeType())) {
|
||||
// The file is not OCR-izable
|
||||
return null;
|
||||
}
|
||||
|
||||
private static String ocrFile(Document document, File file) {
|
||||
Tesseract instance = Tesseract.getInstance();
|
||||
java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile();
|
||||
String content = null;
|
||||
@@ -72,6 +91,35 @@ public class FileUtil {
|
||||
return content;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract text from a PDF.
|
||||
*
|
||||
* @param file File to extract
|
||||
* @return Content extracted
|
||||
*/
|
||||
private static String extractPdf(File file) {
|
||||
String content = null;
|
||||
PDDocument pdfDocument = null;
|
||||
java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile();
|
||||
try {
|
||||
PDFTextStripper stripper = new PDFTextStripper();
|
||||
pdfDocument = PDDocument.load(storedfile);
|
||||
content = stripper.getText(pdfDocument);
|
||||
} catch (IOException e) {
|
||||
log.error("Error while extracting text from the PDF " + storedfile, e);
|
||||
} finally {
|
||||
if (pdfDocument != null) {
|
||||
try {
|
||||
pdfDocument.close();
|
||||
} catch (IOException e) {
|
||||
// NOP
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
/**
|
||||
* Save a file on the storage filesystem.
|
||||
*
|
||||
@@ -84,7 +132,12 @@ public class FileUtil {
|
||||
Files.copy(is, path);
|
||||
|
||||
// Generate file variations
|
||||
saveVariations(file, path.toFile());
|
||||
try {
|
||||
saveVariations(file, path.toFile());
|
||||
} catch (IOException e) {
|
||||
// Don't rethrow Exception from file variations generation
|
||||
log.error("Error creating file variations", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -95,8 +148,22 @@ public class FileUtil {
|
||||
* @throws IOException
|
||||
*/
|
||||
public static void saveVariations(File file, java.io.File originalFile) throws IOException {
|
||||
BufferedImage image = null;
|
||||
if (ImageUtil.isImage(file.getMimeType())) {
|
||||
BufferedImage image = ImageIO.read(originalFile);
|
||||
image = ImageIO.read(originalFile);
|
||||
} else if(file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
|
||||
// Generate preview from the first page of the PDF
|
||||
PDDocument pdfDocument = PDDocument.load(originalFile);
|
||||
@SuppressWarnings("unchecked")
|
||||
List<PDPage> pageList = pdfDocument.getDocumentCatalog().getAllPages();
|
||||
if (pageList.size() > 0) {
|
||||
PDPage page = pageList.get(0);
|
||||
image = page.convertToImage();
|
||||
}
|
||||
}
|
||||
|
||||
if (image != null) {
|
||||
// Generate thumbnails from image
|
||||
BufferedImage web = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 1280, Scalr.OP_ANTIALIAS);
|
||||
BufferedImage thumbnail = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 256, Scalr.OP_ANTIALIAS);
|
||||
image.flush();
|
||||
|
||||
Reference in New Issue
Block a user