1
0
mirror of https://github.com/sismics/docs.git synced 2025-12-15 10:46:26 +00:00

Document language (server), OCR files and store result in database

This commit is contained in:
jendib
2013-08-16 23:48:35 +02:00
parent 70a86dc86f
commit 1f1f02ed41
29 changed files with 670 additions and 27 deletions

View File

@@ -1,5 +1,9 @@
package com.sismics.docs.core.constant;
import java.util.List;
import com.google.common.collect.Lists;
/**
* Application constants.
*
@@ -40,4 +44,9 @@ public class Constants {
* Default generic user role.
*/
public static final String DEFAULT_USER_ROLE = "user";
/**
 * Supported document languages, as three-letter codes.
 *
 * The list is exposed through an unmodifiable view so that this shared constant
 * cannot be altered by callers.
 */
public static final List<String> SUPPORTED_LANGUAGES =
        java.util.Collections.unmodifiableList(Lists.newArrayList("eng", "fra"));
}

View File

@@ -123,7 +123,7 @@ public class DocumentDao {
Map<String, Object> parameterMap = new HashMap<String, Object>();
List<String> criteriaList = new ArrayList<String>();
StringBuilder sb = new StringBuilder("select d.DOC_ID_C c0, d.DOC_TITLE_C c1, d.DOC_DESCRIPTION_C c2, d.DOC_CREATEDATE_D c3, s.SHA_ID_C is not null c4 ");
StringBuilder sb = new StringBuilder("select d.DOC_ID_C c0, d.DOC_TITLE_C c1, d.DOC_DESCRIPTION_C c2, d.DOC_CREATEDATE_D c3, d.DOC_LANGUAGE_C c4, s.SHA_ID_C is not null c5 ");
sb.append(" from T_DOCUMENT d ");
sb.append(" left join T_SHARE s on s.SHA_IDDOCUMENT_C = d.DOC_ID_C and s.SHA_DELETEDATE_D is null ");
@@ -156,6 +156,10 @@ public class DocumentDao {
if (criteria.getShared() != null && criteria.getShared()) {
criteriaList.add("s.SHA_ID_C is not null");
}
if (criteria.getLanguage() != null) {
criteriaList.add("d.DOC_LANGUAGE_C = :language");
parameterMap.put("language", criteria.getLanguage());
}
criteriaList.add("d.DOC_DELETEDATE_D is null");
@@ -177,6 +181,7 @@ public class DocumentDao {
documentDto.setTitle((String) o[i++]);
documentDto.setDescription((String) o[i++]);
documentDto.setCreateTimestamp(((Timestamp) o[i++]).getTime());
documentDto.setLanguage((String) o[i++]);
documentDto.setShared((Boolean) o[i++]);
documentDtoList.add(documentDto);
}

View File

@@ -1,14 +1,15 @@
package com.sismics.docs.core.dao.jpa;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.util.context.ThreadLocalContext;
import java.util.Date;
import java.util.List;
import java.util.UUID;
import javax.persistence.EntityManager;
import javax.persistence.NoResultException;
import javax.persistence.Query;
import java.util.Date;
import java.util.List;
import java.util.UUID;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.util.context.ThreadLocalContext;
/**
* File DAO.
@@ -66,6 +67,26 @@ public class FileDao {
fileDb.setDeleteDate(dateNow);
}
/**
 * Replaces the content of an existing, non-deleted file.
 *
 * The file is looked up by ID among the files whose delete date is null;
 * {@code getSingleResult()} throws a {@code NoResultException} when none matches.
 *
 * @param file File carrying the ID to look up and the new content
 * @return The {@code file} instance passed in (not the entity loaded from the database)
 */
public File updateContent(File file) {
    // Load the active entity matching the given ID
    File storedFile = (File) ThreadLocalContext.get().getEntityManager()
            .createQuery("select f from File f where f.id = :id and f.deleteDate is null")
            .setParameter("id", file.getId())
            .getSingleResult();

    // Copy the new content onto the loaded entity
    storedFile.setContent(file.getContent());

    return file;
}
/**
* Gets a file by its ID.
*

View File

@@ -40,6 +40,11 @@ public class DocumentCriteria {
*/
private Boolean shared;
/**
* Language.
*/
private String language;
/**
* Getter of userId.
*
@@ -147,4 +152,22 @@ public class DocumentCriteria {
public void setShared(Boolean shared) {
this.shared = shared;
}
/**
 * Returns the language criterion.
 *
 * @return the language to filter on, or null when no language filter is set
 */
public String getLanguage() {
    return this.language;
}
/**
* Sets the language criterion.
*
* @param language language to filter on, or null for no language filter
*/
public void setLanguage(String language) {
this.language = language;
}
}

View File

@@ -24,6 +24,11 @@ public class DocumentDto {
*/
private String description;
/**
* Language.
*/
private String language;
/**
* Creation date.
*/
@@ -123,4 +128,22 @@ public class DocumentDto {
public void setShared(Boolean shared) {
this.shared = shared;
}
/**
 * Returns the language of the document.
 *
 * @return the language
 */
public String getLanguage() {
    return this.language;
}
/**
* Sets the language of the document.
*
* @param language language
*/
public void setLanguage(String language) {
this.language = language;
}
}

View File

@@ -0,0 +1,151 @@
package com.sismics.docs.core.dao.lucene;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermsFilter;
import org.apache.lucene.queryparser.flexible.standard.QueryParserUtil;
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.Version;
import com.sismics.docs.core.model.context.AppContext;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.util.LuceneUtil;
import com.sismics.docs.core.util.LuceneUtil.LuceneRunnable;
/**
 * Lucene DAO.
 *
 * Writes file contents to the Lucene index and runs content searches against it.
 *
 * @author bgamard
 */
public class LuceneDao {
    /**
     * Destroys and rebuilds the index from the given files.
     *
     * @param fileList Files to index
     */
    public void rebuildIndex(final List<File> fileList) {
        LuceneUtil.handle(new LuceneRunnable() {
            @Override
            public void run(IndexWriter indexWriter) throws Exception {
                // Empty index
                indexWriter.deleteAll();

                // Add all files
                addFiles(indexWriter, fileList);
            }
        });
    }

    /**
     * Adds files to the index.
     *
     * @param fileList Files to index
     */
    public void create(final List<File> fileList) {
        LuceneUtil.handle(new LuceneRunnable() {
            @Override
            public void run(IndexWriter indexWriter) throws Exception {
                // Add all files
                addFiles(indexWriter, fileList);
            }
        });
    }

    /**
     * Updates the index entries of the given files.
     *
     * @param fileList Files to re-index
     */
    public void update(final List<File> fileList) {
        LuceneUtil.handle(new LuceneRunnable() {
            @Override
            public void run(IndexWriter indexWriter) throws Exception {
                // Replace the indexed document of each file, matched by its "id" term
                for (File file : fileList) {
                    org.apache.lucene.document.Document document = getDocumentFromFile(file);
                    indexWriter.updateDocument(new Term("id", file.getId()), document);
                }
            }
        });
    }

    /**
     * Searches files by content.
     *
     * @param userId ID of the user used to filter the results
     * @param searchQuery Free-text search query
     * @param limit Maximum number of results
     * @return List of file IDs
     * @throws Exception If the query cannot be parsed or the index cannot be read
     */
    public List<String> search(String userId, String searchQuery, int limit) throws Exception {
        // Escape query and add quotes so the query parser generates a PhraseQuery
        String escapedQuery = "\"" + QueryParserUtil.escape(searchQuery) + "\"";

        // Build search query
        StandardQueryParser qpHelper = new StandardQueryParser(new DocsStandardAnalyzer(Version.LUCENE_42));
        // Very large slop so the phrase terms may match far apart from each other
        qpHelper.setPhraseSlop(100000);
        Query contentQuery = qpHelper.parse(escapedQuery, "content");

        // Search on file content
        BooleanQuery query = new BooleanQuery();
        query.add(contentQuery, Occur.SHOULD);

        // Filter on provided user ID
        // NOTE(review): getDocumentFromFile() does not add a "user_id" field, so as written
        // this filter has nothing to match on -- confirm where "user_id" is meant to be indexed.
        List<Term> terms = new ArrayList<Term>();
        terms.add(new Term("user_id", userId));
        TermsFilter userFilter = new TermsFilter(terms);

        // Search, making sure the reader is released even if the search fails
        IndexReader reader = DirectoryReader.open(AppContext.getInstance().getLuceneDirectory());
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs topDocs = searcher.search(query, userFilter, limit);

            // Extract file IDs
            List<String> fileIdList = new ArrayList<String>();
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                fileIdList.add(searcher.doc(scoreDoc.doc).get("id"));
            }
            return fileIdList;
        } finally {
            reader.close();
        }
    }

    /**
     * Adds one Lucene document per file to the index.
     *
     * @param indexWriter Index writer
     * @param fileList Files to add
     * @throws Exception If a document cannot be written
     */
    private void addFiles(IndexWriter indexWriter, List<File> fileList) throws Exception {
        for (File file : fileList) {
            indexWriter.addDocument(getDocumentFromFile(file));
        }
    }

    /**
     * Builds a Lucene document from a file.
     *
     * The file ID is stored in the "id" field; the content is indexed in the
     * "content" field without being stored.
     *
     * @param file File
     * @return Document
     */
    private org.apache.lucene.document.Document getDocumentFromFile(File file) {
        org.apache.lucene.document.Document document = new org.apache.lucene.document.Document();
        document.add(new StringField("id", file.getId(), Field.Store.YES));

        // The content may be null (no OCR result), and Lucene fields reject null values
        String content = file.getContent();
        if (content != null) {
            document.add(new TextField("content", content, Field.Store.NO));
        }

        return document;
    }
}

View File

@@ -0,0 +1,66 @@
package com.sismics.docs.core.event;
import com.google.common.base.Objects;
import com.sismics.docs.core.model.jpa.Document;
import com.sismics.docs.core.model.jpa.File;
/**
* New file created event.
*
* @author bgamard
*/
public class FileCreatedAsyncEvent {
/**
* Created file.
*/
private File file;
/**
* Document linked to the file.
*/
private Document document;
/**
* Getter of file.
*
* @return the file
*/
public File getFile() {
return file;
}
/**
* Setter of file.
*
* @param file file
*/
public void setFile(File file) {
this.file = file;
}
/**
* Getter of document.
*
* @return the document
*/
public Document getDocument() {
return document;
}
/**
* Setter of document.
*
* @param document document
*/
public void setDocument(Document document) {
this.document = document;
}
@Override
public String toString() {
return Objects.toStringHelper(this)
.add("file", file)
.add("document", document)
.toString();
}
}

View File

@@ -0,0 +1,43 @@
package com.sismics.docs.core.listener.async;
import java.text.MessageFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.eventbus.Subscribe;
import com.sismics.docs.core.event.FileCreatedAsyncEvent;
import com.sismics.docs.core.util.FileUtil;
import com.sismics.util.ImageUtil;
/**
 * Listener on new file.
 *
 * Runs OCR on newly created files whose MIME type is an image.
 *
 * @author bgamard
 */
public class FileCreatedAsyncListener {
    /**
     * Logger.
     */
    private static final Logger log = LoggerFactory.getLogger(FileCreatedAsyncListener.class);

    /**
     * Processes a new file: if it is an image, OCR it and log the elapsed time.
     *
     * @param fileCreatedAsyncEvent New file created event
     * @throws Exception
     */
    @Subscribe
    public void onArticleCreated(final FileCreatedAsyncEvent fileCreatedAsyncEvent) throws Exception {
        // Parameterized logging only builds the message when INFO is enabled
        log.info("File created event: {}", fileCreatedAsyncEvent);

        // OCR the file if it is an image
        if (ImageUtil.isImage(fileCreatedAsyncEvent.getFile().getMimeType())) {
            long startTime = System.currentTimeMillis();
            FileUtil.ocrFile(fileCreatedAsyncEvent.getDocument(), fileCreatedAsyncEvent.getFile());
            // Plain placeholder: the duration is printed without locale-specific digit grouping
            log.info("File OCR-ized in {}ms", System.currentTimeMillis() - startTime);
        }
    }
}

View File

@@ -1,15 +1,5 @@
package com.sismics.docs.core.model.context;
import com.google.common.eventbus.AsyncEventBus;
import com.google.common.eventbus.EventBus;
import com.sismics.docs.core.constant.ConfigType;
import com.sismics.docs.core.dao.jpa.ConfigDao;
import com.sismics.docs.core.listener.sync.DeadEventListener;
import com.sismics.docs.core.model.jpa.Config;
import com.sismics.docs.core.service.IndexingService;
import com.sismics.util.EnvironmentUtil;
import org.apache.lucene.store.Directory;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
@@ -17,6 +7,18 @@ import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.store.Directory;
import com.google.common.eventbus.AsyncEventBus;
import com.google.common.eventbus.EventBus;
import com.sismics.docs.core.constant.ConfigType;
import com.sismics.docs.core.dao.jpa.ConfigDao;
import com.sismics.docs.core.listener.async.FileCreatedAsyncListener;
import com.sismics.docs.core.listener.sync.DeadEventListener;
import com.sismics.docs.core.model.jpa.Config;
import com.sismics.docs.core.service.IndexingService;
import com.sismics.util.EnvironmentUtil;
/**
* Global application context.
*
@@ -77,6 +79,7 @@ public class AppContext {
asyncExecutorList = new ArrayList<ExecutorService>();
asyncEventBus = newAsyncEventBus();
asyncEventBus.register(new FileCreatedAsyncListener());
}
/**

View File

@@ -29,6 +29,12 @@ public class Document {
@Column(name = "DOC_IDUSER_C", nullable = false, length = 36)
private String userId;
/**
* Language (three-letter ISO 639 code, e.g. "fra").
*/
@Column(name = "DOC_LANGUAGE_C", nullable = false, length = 3)
private String language;
/**
* Title.
*/
@@ -71,6 +77,24 @@ public class Document {
this.id = id;
}
/**
 * Returns the language of the document.
 *
 * @return the language
 */
public String getLanguage() {
    return this.language;
}
/**
* Sets the language of the document.
*
* @param language language
*/
public void setLanguage(String language) {
this.language = language;
}
/**
* Getter of userId.
*

View File

@@ -5,6 +5,7 @@ import com.google.common.base.Objects;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Lob;
import javax.persistence.Table;
import java.util.Date;
@@ -30,11 +31,18 @@ public class File {
private String documentId;
/**
* Document ID.
* MIME type.
*/
@Column(name = "FIL_MIMETYPE_C", length = 100)
private String mimeType;
/**
* OCR-ized content.
*/
@Lob
@Column(name = "FIL_CONTENT_C")
private String content;
/**
* Creation date.
*/
@@ -143,6 +151,24 @@ public class File {
this.deleteDate = deleteDate;
}
/**
 * Returns the OCR-ized content of the file.
 *
 * @return the content
 */
public String getContent() {
    return this.content;
}
/**
* Sets the OCR-ized content of the file.
*
* @param content content
*/
public void setContent(String content) {
this.content = content;
}
/**
* Getter of order.
*

View File

@@ -0,0 +1,75 @@
package com.sismics.docs.core.util;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.nio.file.Paths;
import javax.imageio.ImageIO;
import net.sourceforge.tess4j.Tesseract;
import org.imgscalr.Scalr;
import org.imgscalr.Scalr.Method;
import org.imgscalr.Scalr.Mode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.sismics.docs.core.dao.jpa.FileDao;
import com.sismics.docs.core.model.jpa.Document;
import com.sismics.docs.core.model.jpa.File;
/**
 * File entity utilities.
 *
 * @author bgamard
 */
public class FileUtil {
    /**
     * Logger.
     */
    private static final Logger log = LoggerFactory.getLogger(FileUtil.class);

    /**
     * Images with a width or height below this size (in pixels) are upscaled before OCR.
     */
    private static final int MIN_OCR_DIMENSION = 2500;

    /**
     * Target size (in pixels) passed to the resizer when upscaling.
     */
    private static final int UPSCALE_TARGET_SIZE = 3500;

    /**
     * OCR a file and store the result in the database.
     *
     * The stored image is read from the storage directory (named after the file ID),
     * upscaled if it is too small, then OCR-ized using the document language.
     * If the image cannot be read, the error is logged and nothing is stored.
     * If the OCR itself fails, the error is logged and a null content is stored.
     *
     * @param document Document linked to the file
     * @param file File to OCR
     */
    public static void ocrFile(Document document, final File file) {
        java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile();

        BufferedImage image = readImage(storedfile);
        if (image == null) {
            // Nothing to OCR; the failure has already been logged
            return;
        }

        // Upscale the image if it is too small
        if (image.getWidth() < MIN_OCR_DIMENSION || image.getHeight() < MIN_OCR_DIMENSION) {
            BufferedImage resizedImage = Scalr.resize(image, Method.AUTOMATIC, Mode.AUTOMATIC, UPSCALE_TARGET_SIZE);
            image.flush();
            image = resizedImage;
        }

        // OCR the file
        // NOTE(review): the Tesseract instance comes from getInstance() and its language is
        // set per call -- confirm this is safe if several files are OCR-ized concurrently.
        Tesseract instance = Tesseract.getInstance();
        String content = null;
        try {
            instance.setLanguage(document.getLanguage());
            content = instance.doOCR(image);
        } catch (Exception e) {
            log.error("Error while OCR-izing the file " + storedfile, e);
        }

        file.setContent(content);

        // Store the OCR-ization result in the database
        TransactionUtil.handle(new Runnable() {
            @Override
            public void run() {
                FileDao fileDao = new FileDao();
                fileDao.updateContent(file);
            }
        });
    }

    /**
     * Reads the stored image, logging any failure.
     *
     * @param storedfile Image file on disk
     * @return The decoded image, or null if it could not be read or decoded
     */
    private static BufferedImage readImage(java.io.File storedfile) {
        try {
            BufferedImage image = ImageIO.read(storedfile);
            if (image == null) {
                // ImageIO.read returns null when no registered reader can decode the file
                log.error("Error reading the image " + storedfile + ": unsupported image format");
            }
            return image;
        } catch (IOException e) {
            log.error("Error reading the image " + storedfile, e);
            return null;
        }
    }
}

View File

@@ -1 +1 @@
db.version=4
db.version=5

View File

@@ -0,0 +1,3 @@
alter table T_FILE add column FIL_CONTENT_C LONGVARCHAR;
alter table T_DOCUMENT add column DOC_LANGUAGE_C varchar(3) default 'fra' not null;
update T_CONFIG set CFG_VALUE_C='5' where CFG_ID_C='DB_VERSION';