1
0
mirror of https://github.com/sismics/docs.git synced 2025-12-13 09:46:17 +00:00

Closes #53: Build thumbnails for DOCX and ODT files

This commit is contained in:
jendib
2015-12-11 22:00:44 +01:00
parent 1a37d97a61
commit 7708f61343
10 changed files with 228 additions and 118 deletions

View File

@@ -28,58 +28,43 @@ public class FileCreatedAsyncEvent {
private InputStream inputStream;
/**
* Unencrypted input stream containing a PDF representation
* of the file. May be null if the PDF conversion is not
* necessary or not possible.
*/
private InputStream pdfInputStream;
/**
* Getter of file.
*
* @return the file
*/
public File getFile() {
return file;
}
/**
* Setter of file.
*
* @param file the file carried by this event
*/
public void setFile(File file) {
this.file = file;
}
/**
* Getter of document.
*
* @return the document carried by this event
*/
public Document getDocument() {
return document;
}
/**
* Setter of document.
*
* @param document the document carried by this event
*/
public void setDocument(Document document) {
this.document = document;
}
/**
* Getter of inputStream.
*
* @return the input stream carried by this event
*/
public InputStream getInputStream() {
return inputStream;
}
/**
* Setter of inputStream.
*
* @param inputStream the input stream carried by this event
*/
public void setInputStream(InputStream inputStream) {
this.inputStream = inputStream;
}
/**
* Getter of pdfInputStream.
*
* @return the PDF input stream; may be null, callers check for null before using it
*/
public InputStream getPdfInputStream() {
return pdfInputStream;
}
/**
* Setter of pdfInputStream.
*
* @param pdfInputStream the PDF input stream carried by this event
*/
public void setPdfInputStream(InputStream pdfInputStream) {
this.pdfInputStream = pdfInputStream;
}
@Override
public String toString() {

View File

@@ -12,7 +12,6 @@ import com.sismics.docs.core.event.FileCreatedAsyncEvent;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.FileUtil;
import com.sismics.docs.core.util.TransactionUtil;
import com.sismics.util.mime.MimeTypeUtil;
/**
* Listener on file created.
@@ -39,12 +38,15 @@ public class FileCreatedAsyncListener {
// Guess the mime type a second time, for open document format (first detected as simple ZIP file)
final File file = fileCreatedAsyncEvent.getFile();
file.setMimeType(MimeTypeUtil.guessOpenDocumentFormat(file, fileCreatedAsyncEvent.getInputStream()));
// Extract text content from the file
long startTime = System.currentTimeMillis();
final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file, fileCreatedAsyncEvent.getInputStream());
final String content = FileUtil.extractContent(fileCreatedAsyncEvent.getDocument(), file,
fileCreatedAsyncEvent.getInputStream(), fileCreatedAsyncEvent.getPdfInputStream());
fileCreatedAsyncEvent.getInputStream().close();
if (fileCreatedAsyncEvent.getPdfInputStream() != null) {
fileCreatedAsyncEvent.getPdfInputStream().close();
}
log.info(MessageFormat.format("File content extracted in {0}ms", System.currentTimeMillis() - startTime));
// Store the text content in the database

View File

@@ -1,6 +1,8 @@
package com.sismics.docs.core.util;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
@@ -48,19 +50,16 @@ public class FileUtil {
* @param document Document linked to the file
* @param file File to extract
* @param inputStream Unencrypted input stream
* @param pdfInputStream Unencrypted PDF input stream
* @return Content extract
*/
public static String extractContent(Document document, File file, InputStream inputStream) {
public static String extractContent(Document document, File file, InputStream inputStream, InputStream pdfInputStream) {
String content = null;
if (ImageUtil.isImage(file.getMimeType())) {
content = ocrFile(inputStream, document);
} else if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
content = extractPdf(inputStream);
} else if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) {
content = extractOpenDocumentText(inputStream);
} else if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) {
content = extractOfficeDocument(inputStream);
} else if (pdfInputStream != null) {
content = extractPdf(pdfInputStream);
}
return content;
@@ -129,92 +128,80 @@ public class FileUtil {
}
/**
* Extract text from an open document text file.
* Convert a file to PDF if necessary.
*
* @param inputStream Unencrypted input stream
* @return Content extracted
* @param inputStream InputStream
* @param file File
* @return PDF input stream
* @throws Exception
*/
private static String extractOpenDocumentText(InputStream inputStream) {
String content = null;
Path tempFile = null;
try {
// Convert the ODT file to a temporary PDF file
tempFile = Files.createTempFile("sismicsdocs_", ".pdf");
try (OutputStream out = Files.newOutputStream(tempFile)) {
OdfTextDocument document = OdfTextDocument.loadDocument(inputStream);
PdfOptions options = PdfOptions.create();
PdfConverter.getInstance().convert(document, out, options);
}
// Extract content from the PDF file
try (InputStream pdfInputStream = Files.newInputStream(tempFile)) {
content = extractPdf(pdfInputStream);
}
} catch (Exception e) {
log.error("Error while extracting text from the ODT", e);
} finally {
try {
Files.delete(tempFile); // Delete the temporary PDF file
} catch (IOException e) {
// Should not happen
}
/**
* Provide a PDF stream for the file, chosen from its MIME type:
* the input itself for a PDF, a converted stream for an Office
* document or an open document text, and null for anything else.
*
* @param inputStream InputStream
* @param file File whose MIME type selects the conversion
* @return PDF input stream, or null if no conversion applies
* @throws Exception
*/
public static InputStream convertToPdf(InputStream inputStream, File file) throws Exception {
if (file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
// It's already PDF, just return the input
return inputStream;
}
// NOTE(review): "content" is not declared anywhere in this method; this line
// looks like a leftover of the removed text-extraction code -- confirm.
return content;
if (file.getMimeType().equals(MimeType.OFFICE_DOCUMENT)) {
return convertOfficeDocument(inputStream);
}
if (file.getMimeType().equals(MimeType.OPEN_DOCUMENT_TEXT)) {
return convertOpenDocumentText(inputStream);
}
// PDF conversion not necessary/possible
return null;
}
/**
* Extract text from an Office document.
* Convert an open document text file to PDF.
*
* @param inputStream Unencrypted input stream
* @return Content extracted
* @return PDF input stream
* @throws Exception
*/
private static String extractOfficeDocument(InputStream inputStream) {
String content = null;
Path tempFile = null;
try {
// Convert the DOCX file to a temporary PDF file
tempFile = Files.createTempFile("sismicsdocs_", ".pdf");
try (OutputStream out = Files.newOutputStream(tempFile)) {
XWPFDocument document = new XWPFDocument(inputStream);
org.apache.poi.xwpf.converter.pdf.PdfOptions options = org.apache.poi.xwpf.converter.pdf.PdfOptions.create();
org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(document, out, options);
}
// Extract content from the PDF file
try (InputStream pdfInputStream = Files.newInputStream(tempFile)) {
content = extractPdf(pdfInputStream);
}
} catch (Exception e) {
log.error("Error while extracting text from the DOCX", e);
} finally {
try {
Files.delete(tempFile); // Delete the temporary PDF file
} catch (IOException e) {
// Should not happen
}
}
return content;
/**
 * Render an open document text file as PDF, entirely in memory.
 *
 * @param inputStream Unencrypted input stream of the ODT file; it is reset once the conversion is done
 * @return Input stream over the generated PDF bytes
 * @throws Exception if loading, converting or resetting the stream fails
 */
private static InputStream convertOpenDocumentText(InputStream inputStream) throws Exception {
    // The PDF is produced into a byte buffer rather than a file
    ByteArrayOutputStream pdfBuffer = new ByteArrayOutputStream();
    OdfTextDocument odt = OdfTextDocument.loadDocument(inputStream);
    PdfOptions pdfOptions = PdfOptions.create();
    PdfConverter.getInstance().convert(odt, pdfBuffer, pdfOptions);

    // Rewind the source stream before handing back the PDF
    inputStream.reset();
    byte[] pdfBytes = pdfBuffer.toByteArray();
    return new ByteArrayInputStream(pdfBytes);
}
/**
 * Render an Office document as PDF, entirely in memory.
 *
 * @param inputStream Unencrypted input stream of the Office document; it is reset once the conversion is done
 * @return Input stream over the generated PDF bytes
 * @throws Exception if loading, converting or resetting the stream fails
 */
private static InputStream convertOfficeDocument(InputStream inputStream) throws Exception {
    // The PDF is produced into a byte buffer rather than a file
    ByteArrayOutputStream pdfBuffer = new ByteArrayOutputStream();
    XWPFDocument officeDocument = new XWPFDocument(inputStream);
    org.apache.poi.xwpf.converter.pdf.PdfOptions pdfOptions = org.apache.poi.xwpf.converter.pdf.PdfOptions.create();
    org.apache.poi.xwpf.converter.pdf.PdfConverter.getInstance().convert(officeDocument, pdfBuffer, pdfOptions);

    // Rewind the source stream before handing back the PDF
    inputStream.reset();
    byte[] pdfBytes = pdfBuffer.toByteArray();
    return new ByteArrayInputStream(pdfBytes);
}
/**
* Save a file on the storage filesystem.
*
* @param inputStream Unencrypted input stream
* @param pdfInputStream Unencrypted PDF input stream, passed on to saveVariations
* @param file File to save
* @param privateKey Private key used for encryption
* @throws Exception
*/
public static void save(InputStream inputStream, File file, String privateKey) throws Exception {
public static void save(InputStream inputStream, InputStream pdfInputStream, File file, String privateKey) throws Exception {
Cipher cipher = EncryptionUtil.getEncryptionCipher(privateKey);
// The stored file is named after the file ID, inside the storage directory
Path path = DirectoryUtil.getStorageDirectory().resolve(file.getId());
// Encrypt the content on the fly while copying it to the storage path
Files.copy(new CipherInputStream(inputStream, cipher), path);
inputStream.reset();
// Generate file variations
inputStream.reset();
saveVariations(file, inputStream, cipher);
inputStream.reset();
saveVariations(file, inputStream, pdfInputStream, cipher);
}
/**
@@ -222,25 +209,27 @@ public class FileUtil {
*
* @param file File from database
* @param inputStream Unencrypted input stream
* @param pdfInputStream Unencrypted PDF input stream
* @param cipher Cipher to use for encryption
* @throws Exception
*/
public static void saveVariations(File file, InputStream inputStream, Cipher cipher) throws Exception {
public static void saveVariations(File file, InputStream inputStream, InputStream pdfInputStream, Cipher cipher) throws Exception {
BufferedImage image = null;
if (ImageUtil.isImage(file.getMimeType())) {
image = ImageIO.read(inputStream);
} else if(file.getMimeType().equals(MimeType.APPLICATION_PDF)) {
inputStream.reset();
} else if(pdfInputStream != null) {
// Generate preview from the first page of the PDF
PDDocument pdfDocument = null;
try {
pdfDocument = PDDocument.load(inputStream);
pdfDocument = PDDocument.load(pdfInputStream);
PDFRenderer renderer = new PDFRenderer(pdfDocument);
image = renderer.renderImage(0);
pdfInputStream.reset();
} finally {
pdfDocument.close();
}
}
// TODO Generate thumbnails for DOCX/ODT documents (guess the MIME type earlier and build a PDF version now?)
if (image != null) {
// Generate thumbnails from image

View File

@@ -1,9 +1,11 @@
package com.sismics.docs.core.util;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import junit.framework.Assert;
import org.apache.pdfbox.io.IOUtils;
import org.junit.Test;
import com.google.common.io.Resources;
@@ -18,19 +20,25 @@ import com.sismics.util.mime.MimeType;
public class TestFileUtil {
/**
* Convert the sample ODT resource to PDF and check the text extracted from it.
*/
@Test
public void extractContentOpenDocumentTextTest() throws Exception {
try (InputStream inputStream = Resources.getResource("file/document.odt").openStream()) {
try (InputStream inputStream = Resources.getResource("file/document.odt").openStream();
// In-memory copy of the resource -- presumably needed because the
// converter calls reset() on the stream it is given; confirm.
InputStream bytesInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) {
File file = new File();
file.setMimeType(MimeType.OPEN_DOCUMENT_TEXT);
Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream));
try (InputStream pdfInputStream = FileUtil.convertToPdf(bytesInputStream, file)) {
Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream, pdfInputStream));
}
}
}
/**
* Convert the sample DOCX resource to PDF and check the text extracted from it.
*/
@Test
public void extractContentOfficeDocumentTest() throws Exception {
try (InputStream inputStream = Resources.getResource("file/document.docx").openStream()) {
try (InputStream inputStream = Resources.getResource("file/document.docx").openStream();
// In-memory copy of the resource -- presumably needed because the
// converter calls reset() on the stream it is given; confirm.
InputStream bytesInputStream = new ByteArrayInputStream(IOUtils.toByteArray(inputStream))) {
File file = new File();
file.setMimeType(MimeType.OFFICE_DOCUMENT);
Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream));
try (InputStream pdfInputStream = FileUtil.convertToPdf(bytesInputStream, file)) {
Assert.assertEquals("Lorem ipsum dolor sit amen.\r\n", FileUtil.extractContent(null, file, inputStream, pdfInputStream));
}
}
}
}