1
0
mirror of https://github.com/sismics/docs.git synced 2025-12-13 09:46:17 +00:00

Search in OCR content, batch to OCR all files

This commit is contained in:
jendib
2013-08-17 00:36:36 +02:00
parent 1f1f02ed41
commit 82682600df
8 changed files with 101 additions and 28 deletions

View File

@@ -1,9 +1,31 @@
package com.sismics.docs.rest.resource;
import java.util.ArrayList;
import java.util.List;
import java.util.ResourceBundle;
import javax.ws.rs.GET;
import javax.ws.rs.POST;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import javax.ws.rs.QueryParam;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Appender;
import org.apache.log4j.Logger;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
import com.sismics.docs.core.dao.jpa.DocumentDao;
import com.sismics.docs.core.dao.jpa.FileDao;
import com.sismics.docs.core.dao.jpa.criteria.DocumentCriteria;
import com.sismics.docs.core.dao.jpa.dto.DocumentDto;
import com.sismics.docs.core.model.jpa.Document;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.ConfigUtil;
import com.sismics.docs.core.util.FileUtil;
import com.sismics.docs.core.util.jpa.PaginatedList;
import com.sismics.docs.core.util.jpa.PaginatedLists;
import com.sismics.docs.core.util.jpa.SortCriteria;
@@ -13,21 +35,6 @@ import com.sismics.rest.exception.ServerException;
import com.sismics.util.log4j.LogCriteria;
import com.sismics.util.log4j.LogEntry;
import com.sismics.util.log4j.MemoryAppender;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Appender;
import org.apache.log4j.Logger;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
import javax.ws.rs.GET;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import javax.ws.rs.QueryParam;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import java.util.ArrayList;
import java.util.List;
import java.util.ResourceBundle;
/**
* General app REST resource.
@@ -129,4 +136,32 @@ public class AppResource extends BaseResource {
return Response.ok().entity(response).build();
}
/**
 * Runs OCR again over every file returned by {@code FileDao.findAll()}.
 * <p>
 * The caller must be authenticated and must pass the
 * {@code BaseFunction.ADMIN} check. Each file is processed together with the
 * document looked up from its document ID.
 * <p>
 * NOTE(review): despite its name, this method triggers OCR
 * ({@code FileUtil.ocrFile}); no explicit re-indexing call is made here.
 *
 * @return Response carrying a JSON object with {@code "status": "ok"}
 * @throws JSONException if the JSON response cannot be built
 */
@POST
@Path("batch/ocr")
@Produces(MediaType.APPLICATION_JSON)
public Response batchReindex() throws JSONException {
    // Access control: authenticated callers only, then the ADMIN base function
    boolean authenticated = authenticate();
    if (!authenticated) {
        throw new ForbiddenClientException();
    }
    checkBaseFunction(BaseFunction.ADMIN);

    // Process the files one by one, each with its owning document
    FileDao files = new FileDao();
    DocumentDao documents = new DocumentDao();
    List<File> allFiles = files.findAll();
    for (File storedFile : allFiles) {
        String ownerId = storedFile.getDocumentId();
        FileUtil.ocrFile(documents.getById(ownerId), storedFile);
    }

    // Constant acknowledgement payload
    JSONObject payload = new JSONObject();
    payload.put("status", "ok");
    return Response.ok().entity(payload).build();
}
}

View File

@@ -23,13 +23,12 @@ public class TestAppResource extends BaseJerseyTest {
*/
@Test
public void testAppResource() throws JSONException {
// Login app1
clientUtil.createUser("app1");
String app1Token = clientUtil.login("app1");
// Login admin
String adminAuthenticationToken = clientUtil.login("admin", "admin", false);
// Check the application info
WebResource appResource = resource().path("/app");
appResource.addFilter(new CookieAuthenticationFilter(app1Token));
appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken));
ClientResponse response = appResource.get(ClientResponse.class);
response = appResource.get(ClientResponse.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
@@ -43,6 +42,13 @@ public class TestAppResource extends BaseJerseyTest {
Long totalMemory = json.getLong("total_memory");
Assert.assertTrue(totalMemory > 0 && totalMemory > freeMemory);
Assert.assertEquals(0, json.getInt("document_count"));
// OCR-ize all files
appResource = resource().path("/app/batch/ocr");
appResource.addFilter(new CookieAuthenticationFilter(adminAuthenticationToken));
response = appResource.post(ClientResponse.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
json = response.getEntity(JSONObject.class);
}
/**

View File

@@ -6,14 +6,21 @@ import com.sun.jersey.api.client.ClientResponse;
import com.sun.jersey.api.client.ClientResponse.Status;
import com.sun.jersey.api.client.WebResource;
import com.sun.jersey.core.util.MultivaluedMapImpl;
import com.sun.jersey.multipart.FormDataBodyPart;
import com.sun.jersey.multipart.FormDataMultiPart;
import junit.framework.Assert;
import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
import org.junit.Test;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.util.Date;
import javax.ws.rs.core.MediaType;
/**
* Exhaustive test of the document resource.
*
@@ -59,6 +66,20 @@ public class TestDocumentResource extends BaseJerseyTest {
String document1Id = json.optString("id");
Assert.assertNotNull(document1Id);
// Add a file
WebResource fileResource = resource().path("/file");
fileResource.addFilter(new CookieAuthenticationFilter(document1Token));
FormDataMultiPart form = new FormDataMultiPart();
InputStream file = this.getClass().getResourceAsStream("/file/Einstein-Roosevelt-letter.png");
FormDataBodyPart fdp = new FormDataBodyPart("file",
new BufferedInputStream(file),
MediaType.APPLICATION_OCTET_STREAM_TYPE);
form.bodyPart(fdp);
form.field("id", document1Id);
response = fileResource.type(MediaType.MULTIPART_FORM_DATA).put(ClientResponse.class, form);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
json = response.getEntity(JSONObject.class);
// Share this document
WebResource fileShareResource = resource().path("/share");
fileShareResource.addFilter(new CookieAuthenticationFilter(document1Token));
@@ -91,7 +112,7 @@ public class TestDocumentResource extends BaseJerseyTest {
documentResource = resource().path("/document/list");
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
getParams = new MultivaluedMapImpl();
getParams.putSingle("search", "Sup");
getParams.putSingle("search", "uranium");
response = documentResource.queryParams(getParams).get(ClientResponse.class);
json = response.getEntity(JSONObject.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
@@ -154,7 +175,7 @@ public class TestDocumentResource extends BaseJerseyTest {
documentResource = resource().path("/document/list");
documentResource.addFilter(new CookieAuthenticationFilter(document1Token));
getParams = new MultivaluedMapImpl();
getParams.putSingle("search", "after:2010 before:2040-08 tag:super shared:yes lang:eng for");
getParams.putSingle("search", "after:2010 before:2040-08 tag:super shared:yes lang:eng uranium");
response = documentResource.queryParams(getParams).get(ClientResponse.class);
json = response.getEntity(JSONObject.class);
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));

View File

@@ -53,7 +53,7 @@ public class TestFileResource extends BaseJerseyTest {
WebResource fileResource = resource().path("/file");
fileResource.addFilter(new CookieAuthenticationFilter(file1AuthenticationToken));
FormDataMultiPart form = new FormDataMultiPart();
InputStream file = this.getClass().getResourceAsStream("/file/Einstein-Roosevelt-letter.png");
InputStream file = this.getClass().getResourceAsStream("/file/PIA00452.jpg");
FormDataBodyPart fdp = new FormDataBodyPart("file",
new BufferedInputStream(file),
MediaType.APPLICATION_OCTET_STREAM_TYPE);
@@ -88,7 +88,7 @@ public class TestFileResource extends BaseJerseyTest {
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
InputStream is = response.getEntityInputStream();
byte[] fileBytes = ByteStreams.toByteArray(is);
Assert.assertEquals(292641, fileBytes.length);
Assert.assertEquals(163510, fileBytes.length);
// Get the thumbnail data
fileResource = resource().path("/file/" + file1Id + "/data");
@@ -99,7 +99,7 @@ public class TestFileResource extends BaseJerseyTest {
Assert.assertEquals(Status.OK, Status.fromStatusCode(response.getStatus()));
is = response.getEntityInputStream();
fileBytes = ByteStreams.toByteArray(is);
Assert.assertEquals(34050, fileBytes.length);
Assert.assertEquals(41935, fileBytes.length);
// Get all files from a document
fileResource = resource().path("/file/list");