1
0
mirror of https://github.com/sismics/docs.git synced 2025-12-17 19:51:39 +00:00

Closes #182: format handling refactoring

This commit is contained in:
Benjamin Gamard
2018-03-18 16:16:32 +01:00
parent 996585d7ac
commit 7ea8d0c0f7
16 changed files with 592 additions and 382 deletions

View File

@@ -4,7 +4,9 @@ import com.google.common.collect.Lists;
import com.google.common.io.Resources;
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.format.*;
import com.sismics.util.mime.MimeType;
import com.sismics.util.mime.MimeTypeUtil;
import org.junit.Assert;
import org.junit.Test;
@@ -25,39 +27,40 @@ public class TestFileUtil {
/**
 * Checks content extraction for the sample file {@code file/document.odt}: the handler
 * found for its guessed MIME type must be an {@code OdtFormatHandler}, and the extracted
 * text must contain the expected sentence.
 *
 * NOTE(review): this listing declares {@code content} twice and mixes the
 * {@code PdfUtil}/{@code FileUtil} calls with the {@code FormatHandler} calls. It looks
 * like the removed and added sides of a diff shown together; confirm before compiling.
 */
@Test
public void extractContentOpenDocumentTextTest() throws Exception {
// Locate the ODT fixture on the classpath
Path path = Paths.get(ClassLoader.getSystemResource("file/document.odt").toURI());
File file = new File();
file.setMimeType(MimeType.OPEN_DOCUMENT_TEXT);
Path pdfPath = PdfUtil.convertToPdf(file, path);
String content = FileUtil.extractContent("eng", file, path, pdfPath);
// Resolve the handler from the MIME type guessed for the fixture
FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "document.odt"));
Assert.assertNotNull(formatHandler);
Assert.assertTrue(formatHandler instanceof OdtFormatHandler);
// "eng" is presumably the language code used for extraction; verify against FormatHandler
String content = formatHandler.extractContent("eng", path);
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
}
/**
 * Checks content extraction for the sample file {@code file/document.docx}: the handler
 * found for its guessed MIME type must be a {@code DocxFormatHandler}, and the extracted
 * text must contain the expected sentence.
 *
 * NOTE(review): this listing declares {@code content} twice and mixes the
 * {@code PdfUtil}/{@code FileUtil} calls with the {@code FormatHandler} calls. It looks
 * like the removed and added sides of a diff shown together; confirm before compiling.
 */
@Test
public void extractContentOfficeDocumentTest() throws Exception {
// Locate the DOCX fixture on the classpath
Path path = Paths.get(ClassLoader.getSystemResource("file/document.docx").toURI());
File file = new File();
file.setMimeType(MimeType.OFFICE_DOCUMENT);
Path pdfPath = PdfUtil.convertToPdf(file, path);
String content = FileUtil.extractContent("eng", file, path, pdfPath);
// Resolve the handler from the MIME type guessed for the fixture
FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "document.docx"));
Assert.assertNotNull(formatHandler);
Assert.assertTrue(formatHandler instanceof DocxFormatHandler);
// "eng" is presumably the language code used for extraction; verify against FormatHandler
String content = formatHandler.extractContent("eng", path);
Assert.assertTrue(content.contains("Lorem ipsum dolor sit amen."));
}
/**
 * Checks content extraction for the sample file {@code file/udhr.pdf}: the handler found
 * for its guessed MIME type must be a {@code PdfFormatHandler}, and the extracted text
 * must contain the expected sentence.
 *
 * NOTE(review): this listing declares {@code content} twice and mixes the
 * {@code FileUtil} call with the {@code FormatHandler} calls. It looks like the removed
 * and added sides of a diff shown together; confirm before compiling.
 */
@Test
public void extractContentPdf() throws Exception {
// Locate the PDF fixture on the classpath
Path path = Paths.get(ClassLoader.getSystemResource("file/udhr.pdf").toURI());
File file = new File();
file.setMimeType(MimeType.APPLICATION_PDF);
String content = FileUtil.extractContent("eng", file, path, path);
// Resolve the handler from the MIME type guessed for the fixture
FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "udhr.pdf"));
Assert.assertNotNull(formatHandler);
Assert.assertTrue(formatHandler instanceof PdfFormatHandler);
// "eng" is presumably the language code used for extraction; verify against FormatHandler
String content = formatHandler.extractContent("eng", path);
Assert.assertTrue(content.contains("All human beings are born free and equal in dignity and rights."));
}
/**
 * Checks content extraction for the sample file {@code file/scanned.pdf}: the handler
 * found for its guessed MIME type must be a {@code PdfFormatHandler}, and the extracted
 * text must contain the same sentence expected from {@code udhr.pdf}.
 *
 * NOTE(review): this listing declares {@code content} twice and mixes the
 * {@code FileUtil} call with the {@code FormatHandler} calls. It looks like the removed
 * and added sides of a diff shown together; confirm before compiling.
 */
@Test
public void extractContentScannedPdf() throws Exception {
// Locate the scanned PDF fixture on the classpath
Path path = Paths.get(ClassLoader.getSystemResource("file/scanned.pdf").toURI());
File file = new File();
file.setMimeType(MimeType.APPLICATION_PDF);
String content = FileUtil.extractContent("eng", file, path, path);
// Prints the extracted text to standard output
System.out.println(content);
// Resolve the handler from the MIME type guessed for the fixture
FormatHandler formatHandler = FormatHandlerUtil.find(MimeTypeUtil.guessMimeType(path, "scanned.pdf"));
Assert.assertNotNull(formatHandler);
Assert.assertTrue(formatHandler instanceof PdfFormatHandler);
// "eng" is presumably the language code used for extraction; verify against FormatHandler
String content = formatHandler.extractContent("eng", path);
Assert.assertTrue(content.contains("All human beings are born free and equal in dignity and rights."));
}

View File

@@ -1,6 +1,5 @@
package com.sismics.util;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.util.mime.MimeType;
import com.sismics.util.mime.MimeTypeUtil;
import org.junit.Assert;
@@ -19,14 +18,10 @@ public class TestMimeTypeUtil {
public void guessOpenDocumentFormatTest() throws Exception {
// Checks that the ODT and DOCX sample files are resolved to their specific MIME types.
// Each File is labelled APPLICATION_ZIP before the guessOpenDocumentFormat assertion.

// Detect ODT files
Path path = Paths.get(ClassLoader.getSystemResource("file/document.odt").toURI());
File file = new File();
file.setMimeType(MimeType.APPLICATION_ZIP);
Assert.assertEquals(MimeType.OPEN_DOCUMENT_TEXT, MimeTypeUtil.guessOpenDocumentFormat(file, path));
Assert.assertEquals(MimeType.OPEN_DOCUMENT_TEXT, MimeTypeUtil.guessMimeType(path, "document.odt"));
// Detect DOCX files
path = Paths.get(ClassLoader.getSystemResource("file/document.docx").toURI());
file = new File();
file.setMimeType(MimeType.APPLICATION_ZIP);
Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessOpenDocumentFormat(file, path));
// The name argument must match the DOCX sample under test, not the ODT one
Assert.assertEquals(MimeType.OFFICE_DOCUMENT, MimeTypeUtil.guessMimeType(path, "document.docx"));
}
}