mirror of
https://github.com/sismics/docs.git
synced 2025-12-15 10:46:26 +00:00
Document language (server), OCR files and store result in database
This commit is contained in:
@@ -1,5 +1,9 @@
|
||||
package com.sismics.docs.core.constant;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
/**
|
||||
* Application constants.
|
||||
*
|
||||
@@ -40,4 +44,9 @@ public class Constants {
|
||||
* Default generic user role.
|
||||
*/
|
||||
public static final String DEFAULT_USER_ROLE = "user";
|
||||
|
||||
/**
|
||||
* Supported document languages.
|
||||
*/
|
||||
public static final List<String> SUPPORTED_LANGUAGES = Lists.newArrayList("eng", "fra");
|
||||
}
|
||||
|
||||
@@ -123,7 +123,7 @@ public class DocumentDao {
|
||||
Map<String, Object> parameterMap = new HashMap<String, Object>();
|
||||
List<String> criteriaList = new ArrayList<String>();
|
||||
|
||||
StringBuilder sb = new StringBuilder("select d.DOC_ID_C c0, d.DOC_TITLE_C c1, d.DOC_DESCRIPTION_C c2, d.DOC_CREATEDATE_D c3, s.SHA_ID_C is not null c4 ");
|
||||
StringBuilder sb = new StringBuilder("select d.DOC_ID_C c0, d.DOC_TITLE_C c1, d.DOC_DESCRIPTION_C c2, d.DOC_CREATEDATE_D c3, d.DOC_LANGUAGE_C c4, s.SHA_ID_C is not null c5 ");
|
||||
sb.append(" from T_DOCUMENT d ");
|
||||
sb.append(" left join T_SHARE s on s.SHA_IDDOCUMENT_C = d.DOC_ID_C and s.SHA_DELETEDATE_D is null ");
|
||||
|
||||
@@ -156,6 +156,10 @@ public class DocumentDao {
|
||||
if (criteria.getShared() != null && criteria.getShared()) {
|
||||
criteriaList.add("s.SHA_ID_C is not null");
|
||||
}
|
||||
if (criteria.getLanguage() != null) {
|
||||
criteriaList.add("d.DOC_LANGUAGE_C = :language");
|
||||
parameterMap.put("language", criteria.getLanguage());
|
||||
}
|
||||
|
||||
criteriaList.add("d.DOC_DELETEDATE_D is null");
|
||||
|
||||
@@ -177,6 +181,7 @@ public class DocumentDao {
|
||||
documentDto.setTitle((String) o[i++]);
|
||||
documentDto.setDescription((String) o[i++]);
|
||||
documentDto.setCreateTimestamp(((Timestamp) o[i++]).getTime());
|
||||
documentDto.setLanguage((String) o[i++]);
|
||||
documentDto.setShared((Boolean) o[i++]);
|
||||
documentDtoList.add(documentDto);
|
||||
}
|
||||
|
||||
@@ -1,14 +1,15 @@
|
||||
package com.sismics.docs.core.dao.jpa;
|
||||
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
import com.sismics.util.context.ThreadLocalContext;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
import javax.persistence.EntityManager;
|
||||
import javax.persistence.NoResultException;
|
||||
import javax.persistence.Query;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
import com.sismics.util.context.ThreadLocalContext;
|
||||
|
||||
/**
|
||||
* File DAO.
|
||||
@@ -66,6 +67,26 @@ public class FileDao {
|
||||
fileDb.setDeleteDate(dateNow);
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates the content of a file.
|
||||
*
|
||||
* @param file File to update
|
||||
* @return Updated file
|
||||
*/
|
||||
public File updateContent(File file) {
|
||||
EntityManager em = ThreadLocalContext.get().getEntityManager();
|
||||
|
||||
// Get the file
|
||||
Query q = em.createQuery("select f from File f where f.id = :id and f.deleteDate is null");
|
||||
q.setParameter("id", file.getId());
|
||||
File fileFromDb = (File) q.getSingleResult();
|
||||
|
||||
// Update the user
|
||||
fileFromDb.setContent(file.getContent());
|
||||
|
||||
return file;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a file by its ID.
|
||||
*
|
||||
|
||||
@@ -40,6 +40,11 @@ public class DocumentCriteria {
|
||||
*/
|
||||
private Boolean shared;
|
||||
|
||||
/**
|
||||
* Language.
|
||||
*/
|
||||
private String language;
|
||||
|
||||
/**
|
||||
* Getter of userId.
|
||||
*
|
||||
@@ -147,4 +152,22 @@ public class DocumentCriteria {
|
||||
public void setShared(Boolean shared) {
|
||||
this.shared = shared;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter of language.
|
||||
*
|
||||
* @return the language
|
||||
*/
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
/**
|
||||
* Setter of language.
|
||||
*
|
||||
* @param language language
|
||||
*/
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,6 +24,11 @@ public class DocumentDto {
|
||||
*/
|
||||
private String description;
|
||||
|
||||
/**
|
||||
* Language.
|
||||
*/
|
||||
private String language;
|
||||
|
||||
/**
|
||||
* Creation date.
|
||||
*/
|
||||
@@ -123,4 +128,22 @@ public class DocumentDto {
|
||||
public void setShared(Boolean shared) {
|
||||
this.shared = shared;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter of language.
|
||||
*
|
||||
* @return the language
|
||||
*/
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
/**
|
||||
* Setter of language.
|
||||
*
|
||||
* @param language language
|
||||
*/
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,151 @@
|
||||
package com.sismics.docs.core.dao.lucene;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.queries.TermsFilter;
|
||||
import org.apache.lucene.queryparser.flexible.standard.QueryParserUtil;
|
||||
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import com.sismics.docs.core.model.context.AppContext;
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
import com.sismics.docs.core.util.LuceneUtil;
|
||||
import com.sismics.docs.core.util.LuceneUtil.LuceneRunnable;
|
||||
|
||||
/**
|
||||
* Lucene DAO.
|
||||
*
|
||||
* @author bgamard
|
||||
*/
|
||||
public class LuceneDao {
|
||||
|
||||
/**
|
||||
* Destroy and rebuild index.
|
||||
*
|
||||
* @param fileList
|
||||
*/
|
||||
public void rebuildIndex(final List<File> fileList) {
|
||||
LuceneUtil.handle(new LuceneRunnable() {
|
||||
@Override
|
||||
public void run(IndexWriter indexWriter) throws Exception {
|
||||
// Empty index
|
||||
indexWriter.deleteAll();
|
||||
|
||||
// Add all files
|
||||
for (File file : fileList) {
|
||||
org.apache.lucene.document.Document document = getDocumentFromFile(file);
|
||||
indexWriter.addDocument(document);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Add files to the index.
|
||||
*
|
||||
* @param fileList
|
||||
*/
|
||||
public void create(final List<File> fileList) {
|
||||
LuceneUtil.handle(new LuceneRunnable() {
|
||||
@Override
|
||||
public void run(IndexWriter indexWriter) throws Exception {
|
||||
// Add all files
|
||||
for (File file : fileList) {
|
||||
org.apache.lucene.document.Document document = getDocumentFromFile(file);
|
||||
indexWriter.addDocument(document);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Update index.
|
||||
*
|
||||
* @param fileList File list
|
||||
*/
|
||||
public void update(final List<File> fileList) {
|
||||
LuceneUtil.handle(new LuceneRunnable() {
|
||||
@Override
|
||||
public void run(IndexWriter indexWriter) throws Exception {
|
||||
// Update all files
|
||||
for (File file : fileList) {
|
||||
org.apache.lucene.document.Document document = getDocumentFromFile(file);
|
||||
indexWriter.updateDocument(new Term("id", file.getId()), document);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Search files.
|
||||
*
|
||||
* @param paginatedList
|
||||
* @param feedList
|
||||
* @param searchQuery
|
||||
* @return List of file IDs
|
||||
* @throws Exception
|
||||
*/
|
||||
public List<String> search(String userId, String searchQuery, int limit) throws Exception {
|
||||
// Escape query and add quotes so QueryParser generate a PhraseQuery
|
||||
searchQuery = "\"" + QueryParserUtil.escape(searchQuery) + "\"";
|
||||
|
||||
// Build search query
|
||||
StandardQueryParser qpHelper = new StandardQueryParser(new DocsStandardAnalyzer(Version.LUCENE_42));
|
||||
qpHelper.setPhraseSlop(100000); // PhraseQuery add terms
|
||||
Query contentQuery = qpHelper.parse(searchQuery, "content");
|
||||
|
||||
// Search on file content
|
||||
BooleanQuery query = new BooleanQuery();
|
||||
query.add(contentQuery, Occur.SHOULD);
|
||||
|
||||
// Filter on provided user ID
|
||||
List<Term> terms = new ArrayList<Term>();
|
||||
terms.add(new Term("user_id", userId));
|
||||
TermsFilter feedsFilter = new TermsFilter(terms);
|
||||
|
||||
// Search
|
||||
IndexReader reader = DirectoryReader.open(AppContext.getInstance().getLuceneDirectory());
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
TopDocs topDocs = searcher.search(query, feedsFilter, limit);
|
||||
ScoreDoc[] docs = topDocs.scoreDocs;
|
||||
|
||||
// Extract file IDs
|
||||
List<String> fileIdList = new ArrayList<String>();
|
||||
for (int i = 0; i < docs.length; i++) {
|
||||
String id = searcher.doc(docs[i].doc).get("id");
|
||||
fileIdList.add(id);
|
||||
}
|
||||
|
||||
return fileIdList;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build Lucene document from file.
|
||||
*
|
||||
* @param file File
|
||||
* @return Document
|
||||
*/
|
||||
private org.apache.lucene.document.Document getDocumentFromFile(File file) {
|
||||
// Building document
|
||||
org.apache.lucene.document.Document document = new org.apache.lucene.document.Document();
|
||||
document.add(new StringField("id", file.getId(), Field.Store.YES));
|
||||
document.add(new TextField("content", file.getContent(), Field.Store.NO));
|
||||
|
||||
return document;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,66 @@
|
||||
package com.sismics.docs.core.event;
|
||||
|
||||
import com.google.common.base.Objects;
|
||||
import com.sismics.docs.core.model.jpa.Document;
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
|
||||
/**
|
||||
* New file created event.
|
||||
*
|
||||
* @author bgamard
|
||||
*/
|
||||
public class FileCreatedAsyncEvent {
|
||||
/**
|
||||
* Created file.
|
||||
*/
|
||||
private File file;
|
||||
|
||||
/**
|
||||
* Document linked to the file.
|
||||
*/
|
||||
private Document document;
|
||||
|
||||
/**
|
||||
* Getter of file.
|
||||
*
|
||||
* @return the file
|
||||
*/
|
||||
public File getFile() {
|
||||
return file;
|
||||
}
|
||||
|
||||
/**
|
||||
* Setter of file.
|
||||
*
|
||||
* @param file file
|
||||
*/
|
||||
public void setFile(File file) {
|
||||
this.file = file;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter of document.
|
||||
*
|
||||
* @return the document
|
||||
*/
|
||||
public Document getDocument() {
|
||||
return document;
|
||||
}
|
||||
|
||||
/**
|
||||
* Setter of document.
|
||||
*
|
||||
* @param document document
|
||||
*/
|
||||
public void setDocument(Document document) {
|
||||
this.document = document;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return Objects.toStringHelper(this)
|
||||
.add("file", file)
|
||||
.add("document", document)
|
||||
.toString();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,43 @@
|
||||
package com.sismics.docs.core.listener.async;
|
||||
|
||||
import java.text.MessageFormat;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.common.eventbus.Subscribe;
|
||||
import com.sismics.docs.core.event.FileCreatedAsyncEvent;
|
||||
import com.sismics.docs.core.util.FileUtil;
|
||||
import com.sismics.util.ImageUtil;
|
||||
|
||||
/**
|
||||
* Listener on new file.
|
||||
*
|
||||
* @author bgamard
|
||||
*/
|
||||
public class FileCreatedAsyncListener {
|
||||
/**
|
||||
* Logger.
|
||||
*/
|
||||
private static final Logger log = LoggerFactory.getLogger(FileCreatedAsyncListener.class);
|
||||
|
||||
/**
|
||||
* Process new file.
|
||||
*
|
||||
* @param fileCreatedAsyncEvent New file created event
|
||||
* @throws Exception
|
||||
*/
|
||||
@Subscribe
|
||||
public void onArticleCreated(final FileCreatedAsyncEvent fileCreatedAsyncEvent) throws Exception {
|
||||
if (log.isInfoEnabled()) {
|
||||
log.info("File created event: " + fileCreatedAsyncEvent.toString());
|
||||
}
|
||||
|
||||
// OCR the file if it is an image
|
||||
if (ImageUtil.isImage(fileCreatedAsyncEvent.getFile().getMimeType())) {
|
||||
long startTime = System.currentTimeMillis();
|
||||
FileUtil.ocrFile(fileCreatedAsyncEvent.getDocument(), fileCreatedAsyncEvent.getFile());
|
||||
log.info(MessageFormat.format("File OCR-ized in {0}ms", System.currentTimeMillis() - startTime));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,15 +1,5 @@
|
||||
package com.sismics.docs.core.model.context;
|
||||
|
||||
import com.google.common.eventbus.AsyncEventBus;
|
||||
import com.google.common.eventbus.EventBus;
|
||||
import com.sismics.docs.core.constant.ConfigType;
|
||||
import com.sismics.docs.core.dao.jpa.ConfigDao;
|
||||
import com.sismics.docs.core.listener.sync.DeadEventListener;
|
||||
import com.sismics.docs.core.model.jpa.Config;
|
||||
import com.sismics.docs.core.service.IndexingService;
|
||||
import com.sismics.util.EnvironmentUtil;
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
@@ -17,6 +7,18 @@ import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.ThreadPoolExecutor;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
||||
import com.google.common.eventbus.AsyncEventBus;
|
||||
import com.google.common.eventbus.EventBus;
|
||||
import com.sismics.docs.core.constant.ConfigType;
|
||||
import com.sismics.docs.core.dao.jpa.ConfigDao;
|
||||
import com.sismics.docs.core.listener.async.FileCreatedAsyncListener;
|
||||
import com.sismics.docs.core.listener.sync.DeadEventListener;
|
||||
import com.sismics.docs.core.model.jpa.Config;
|
||||
import com.sismics.docs.core.service.IndexingService;
|
||||
import com.sismics.util.EnvironmentUtil;
|
||||
|
||||
/**
|
||||
* Global application context.
|
||||
*
|
||||
@@ -77,6 +79,7 @@ public class AppContext {
|
||||
asyncExecutorList = new ArrayList<ExecutorService>();
|
||||
|
||||
asyncEventBus = newAsyncEventBus();
|
||||
asyncEventBus.register(new FileCreatedAsyncListener());
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -29,6 +29,12 @@ public class Document {
|
||||
@Column(name = "DOC_IDUSER_C", nullable = false, length = 36)
|
||||
private String userId;
|
||||
|
||||
/**
|
||||
* Language (three-letter ISO 639 code, e.g. "eng" or "fra").
|
||||
*/
|
||||
@Column(name = "DOC_LANGUAGE_C", nullable = false, length = 3)
|
||||
private String language;
|
||||
|
||||
/**
|
||||
* Title.
|
||||
*/
|
||||
@@ -71,6 +77,24 @@ public class Document {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter of language.
|
||||
*
|
||||
* @return the language
|
||||
*/
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
/**
|
||||
* Setter of language.
|
||||
*
|
||||
* @param language language
|
||||
*/
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter of userId.
|
||||
*
|
||||
|
||||
@@ -5,6 +5,7 @@ import com.google.common.base.Objects;
|
||||
import javax.persistence.Column;
|
||||
import javax.persistence.Entity;
|
||||
import javax.persistence.Id;
|
||||
import javax.persistence.Lob;
|
||||
import javax.persistence.Table;
|
||||
import java.util.Date;
|
||||
|
||||
@@ -30,11 +31,18 @@ public class File {
|
||||
private String documentId;
|
||||
|
||||
/**
|
||||
* Document ID.
|
||||
* MIME type.
|
||||
*/
|
||||
@Column(name = "FIL_MIMETYPE_C", length = 100)
|
||||
private String mimeType;
|
||||
|
||||
/**
|
||||
* OCR-ized content.
|
||||
*/
|
||||
@Lob
|
||||
@Column(name = "FIL_CONTENT_C")
|
||||
private String content;
|
||||
|
||||
/**
|
||||
* Creation date.
|
||||
*/
|
||||
@@ -143,6 +151,24 @@ public class File {
|
||||
this.deleteDate = deleteDate;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter of content.
|
||||
*
|
||||
* @return the content
|
||||
*/
|
||||
public String getContent() {
|
||||
return content;
|
||||
}
|
||||
|
||||
/**
|
||||
* Setter of content.
|
||||
*
|
||||
* @param content content
|
||||
*/
|
||||
public void setContent(String content) {
|
||||
this.content = content;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter of order.
|
||||
*
|
||||
|
||||
@@ -0,0 +1,75 @@
|
||||
package com.sismics.docs.core.util;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import net.sourceforge.tess4j.Tesseract;
|
||||
|
||||
import org.imgscalr.Scalr;
|
||||
import org.imgscalr.Scalr.Method;
|
||||
import org.imgscalr.Scalr.Mode;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.sismics.docs.core.dao.jpa.FileDao;
|
||||
import com.sismics.docs.core.model.jpa.Document;
|
||||
import com.sismics.docs.core.model.jpa.File;
|
||||
|
||||
/**
|
||||
* File entity utilities.
|
||||
*
|
||||
* @author bgamard
|
||||
*/
|
||||
public class FileUtil {
|
||||
/**
|
||||
* Logger.
|
||||
*/
|
||||
private static final Logger log = LoggerFactory.getLogger(FileUtil.class);
|
||||
|
||||
/**
|
||||
* OCR a file.
|
||||
*
|
||||
* @param document Document linked to the file
|
||||
* @param file File to OCR
|
||||
*/
|
||||
public static void ocrFile(Document document, final File file) {
|
||||
Tesseract instance = Tesseract.getInstance();
|
||||
java.io.File storedfile = Paths.get(DirectoryUtil.getStorageDirectory().getPath(), file.getId()).toFile();
|
||||
String content = null;
|
||||
BufferedImage image = null;
|
||||
try {
|
||||
image = ImageIO.read(storedfile);
|
||||
} catch (IOException e) {
|
||||
log.error("Error reading the image " + storedfile, e);
|
||||
}
|
||||
|
||||
// Upscale the image if it is too small
|
||||
if (image.getWidth() < 2500 || image.getHeight() < 2500) {
|
||||
BufferedImage resizedImage = Scalr.resize(image, Method.AUTOMATIC, Mode.AUTOMATIC, 3500);
|
||||
image.flush();
|
||||
image = resizedImage;
|
||||
}
|
||||
|
||||
// OCR the file
|
||||
try {
|
||||
instance.setLanguage(document.getLanguage());
|
||||
content = instance.doOCR(image);
|
||||
} catch (Exception e) {
|
||||
log.error("Error while OCR-izing the file " + storedfile, e);
|
||||
}
|
||||
|
||||
file.setContent(content);
|
||||
|
||||
// Store the OCR-ization result in the database
|
||||
TransactionUtil.handle(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
FileDao fileDao = new FileDao();
|
||||
fileDao.updateContent(file);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -1 +1 @@
|
||||
db.version=4
|
||||
db.version=5
|
||||
@@ -0,0 +1,3 @@
|
||||
alter table T_FILE add column FIL_CONTENT_C LONGVARCHAR;
|
||||
alter table T_DOCUMENT add column DOC_LANGUAGE_C varchar(3) default 'fra' not null;
|
||||
update T_CONFIG set CFG_VALUE_C='5' where CFG_ID_C='DB_VERSION';
|
||||
Reference in New Issue
Block a user