1
0
mirror of https://github.com/sismics/docs.git synced 2025-12-13 09:46:17 +00:00

#118: extract text content from text plain files (WIP)

This commit is contained in:
Benjamin Gamard
2017-06-11 11:33:30 +02:00
parent dcc7fe55f4
commit 330de495db
5 changed files with 106 additions and 29 deletions

View File

@@ -545,4 +545,63 @@ public class TestDocumentResource extends BaseJerseyTest {
Assert.assertTrue(fileBytes.length > 0); // Images rendered from PDF differ in size from OS to OS due to font issues
Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
}
/**
 * Test plain text extraction.
 *
 * Creates a user and a document, attaches the plain text resource
 * {@code file/document.txt}, then checks that a full content search finds
 * the document and that the file's thumbnail data is served as a JPEG.
 *
 * @throws Exception if a request, the resource loading or the stream reading fails
 */
@Test
public void testPlainTextExtraction() throws Exception {
    // Login document_plain
    clientUtil.createUser("document_plain");
    String documentPlainToken = clientUtil.login("document_plain");

    // Create a document
    long create1Date = new Date().getTime();
    JsonObject json = target().path("/document").request()
            .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
            .put(Entity.form(new Form()
                    .param("title", "My super title document 1")
                    .param("description", "My super description for document 1")
                    .param("language", "eng")
                    .param("create_date", Long.toString(create1Date))), JsonObject.class);
    String document1Id = json.getString("id");
    Assert.assertNotNull(document1Id);

    // Add a plain text file
    String file1Id;
    try (InputStream is = Resources.getResource("file/document.txt").openStream()) {
        StreamDataBodyPart streamDataBodyPart = new StreamDataBodyPart("file", is, "document.txt");
        try (FormDataMultiPart multiPart = new FormDataMultiPart()) {
            json = target()
                    .register(MultiPartFeature.class)
                    .path("/file").request()
                    .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
                    .put(Entity.entity(multiPart.field("id", document1Id).bodyPart(streamDataBodyPart),
                            MediaType.MULTIPART_FORM_DATA_TYPE), JsonObject.class);
            file1Id = json.getString("id");
            Assert.assertNotNull(file1Id);
        }
    }

    // Search documents by query in full content
    json = target().path("/document/list")
            .queryParam("search", "full:love")
            .request()
            .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
            .get(JsonObject.class);
    // assertEquals reports the actual number of hits when the search does not match exactly one document
    Assert.assertEquals(1, json.getJsonArray("documents").size());

    // Get the file thumbnail data
    Response response = target().path("/file/" + file1Id + "/data")
            .queryParam("size", "thumb")
            .request()
            .cookie(TokenBasedSecurityFilter.COOKIE_NAME, documentPlainToken)
            .get();
    byte[] fileBytes;
    // ByteStreams.toByteArray does not close its argument, so close the entity stream explicitly
    try (InputStream thumbnailStream = (InputStream) response.getEntity()) {
        fileBytes = ByteStreams.toByteArray(thumbnailStream);
    }
    // Only non-emptiness and the JPEG type are checked; the exact thumbnail size is not pinned
    Assert.assertTrue(fileBytes.length > 0);
    Assert.assertEquals(MimeType.IMAGE_JPEG, MimeTypeUtil.guessMimeType(fileBytes, null));
}
}

View File

@@ -0,0 +1,2 @@
This is a test document
Please love me