From 8c37af62077fd2f41c65345fe53f3973c386bb3d Mon Sep 17 00:00:00 2001 From: Benjamin Gamard Date: Fri, 19 Oct 2018 19:13:39 +0200 Subject: [PATCH] #241: search suggestions --- docs-core/pom.xml | 10 +++ .../core/util/indexing/IndexingHandler.java | 3 +- .../util/indexing/LuceneIndexingHandler.java | 82 +++++++++++++++---- .../docs/rest/resource/DocumentResource.java | 12 ++- .../app/docs/controller/document/Document.js | 1 + .../webapp/src/partial/docs/document.html | 5 +- pom.xml | 12 +++ 7 files changed, 104 insertions(+), 21 deletions(-) diff --git a/docs-core/pom.xml b/docs-core/pom.xml index 28071930..ac17fcb4 100644 --- a/docs-core/pom.xml +++ b/docs-core/pom.xml @@ -112,6 +112,16 @@ lucene-queryparser + + org.apache.lucene + lucene-suggest + + + + org.apache.lucene + lucene-highlighter + + com.sun.mail javax.mail diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/indexing/IndexingHandler.java b/docs-core/src/main/java/com/sismics/docs/core/util/indexing/IndexingHandler.java index 459caa6c..d1a94f57 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/indexing/IndexingHandler.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/indexing/IndexingHandler.java @@ -100,9 +100,10 @@ public interface IndexingHandler { * Searches documents by criteria. * * @param paginatedList List of documents (updated by side effects) + * @param suggestionList Suggestion of search query (updated by side effects) * @param criteria Search criteria * @param sortCriteria Sort criteria * @throws Exception e */ - void findByCriteria(PaginatedList paginatedList, DocumentCriteria criteria, SortCriteria sortCriteria) throws Exception; + void findByCriteria(PaginatedList paginatedList, List suggestionList, DocumentCriteria criteria, SortCriteria sortCriteria) throws Exception; } diff --git a/docs-core/src/main/java/com/sismics/docs/core/util/indexing/LuceneIndexingHandler.java b/docs-core/src/main/java/com/sismics/docs/core/util/indexing/LuceneIndexingHandler.java index 792ee54b..066ee58d 100644 --- a/docs-core/src/main/java/com/sismics/docs/core/util/indexing/LuceneIndexingHandler.java +++ b/docs-core/src/main/java/com/sismics/docs/core/util/indexing/LuceneIndexingHandler.java @@ -18,6 +18,7 @@ import com.sismics.docs.core.util.jpa.PaginatedLists; import com.sismics.docs.core.util.jpa.QueryParam; import com.sismics.docs.core.util.jpa.SortCriteria; import com.sismics.util.ClasspathScanner; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; @@ -26,6 +27,13 @@ import org.apache.lucene.index.*; import org.apache.lucene.queryparser.flexible.standard.QueryParserUtil; import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; import org.apache.lucene.search.*; +import org.apache.lucene.search.highlight.Highlighter; +import org.apache.lucene.search.highlight.QueryScorer; +import org.apache.lucene.search.highlight.SimpleHTMLEncoder; +import org.apache.lucene.search.highlight.SimpleHTMLFormatter; +import org.apache.lucene.search.spell.LuceneDictionary; +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.analyzing.FuzzySuggester; import org.apache.lucene.store.Directory; import org.apache.lucene.store.NoLockFactory; import org.apache.lucene.store.RAMDirectory; @@ -207,7 +215,7 @@ public class LuceneIndexingHandler implements IndexingHandler { } @Override - public void findByCriteria(PaginatedList paginatedList, DocumentCriteria criteria, SortCriteria sortCriteria) throws Exception { + public void findByCriteria(PaginatedList paginatedList, List suggestionList, DocumentCriteria criteria, SortCriteria sortCriteria) throws Exception { Map parameterMap = new HashMap<>(); List criteriaList = new ArrayList<>(); @@ -247,6 +255,8 @@ public class LuceneIndexingHandler implements IndexingHandler { } criteriaList.add("d.DOC_ID_C in :documentIdList"); parameterMap.put("documentIdList", documentIdList); + + suggestSearchTerms(criteria.getSearch(), suggestionList); } if (criteria.getCreateDateMin() != null) { criteriaList.add("d.DOC_CREATEDATE_D >= :createDateMin"); @@ -326,6 +336,30 @@ public class LuceneIndexingHandler implements IndexingHandler { paginatedList.setResultList(documentDtoList); } + /** + * Suggest search terms according to the user query. + * + * @param search User search query + * @param suggestionList Suggestion of search query (updated by side effects) + * @throws Exception e + */ + private void suggestSearchTerms(String search, List suggestionList) throws Exception { + DirectoryReader directoryReader = getDirectoryReader(); + if (directoryReader == null) { + return; + } + + FuzzySuggester suggester = new FuzzySuggester(new StandardAnalyzer()); + LuceneDictionary dictionary = new LuceneDictionary(directoryReader, "title"); + suggester.build(dictionary); + int lastIndex = search.lastIndexOf(' '); + String suggestQuery = search.substring(lastIndex < 0 ? 0 : lastIndex); + List lookupResultList = suggester.lookup(suggestQuery, false, 10); + for (Lookup.LookupResult lookupResult : lookupResultList) { + suggestionList.add(lookupResult.key.toString()); + } + } + /** * Fulltext search in files and documents. * @@ -336,27 +370,28 @@ public class LuceneIndexingHandler implements IndexingHandler { */ private Set search(String searchQuery, String fullSearchQuery) throws Exception { // Escape query and add quotes so QueryParser generate a PhraseQuery - searchQuery = "\"" + QueryParserUtil.escape(searchQuery + " " + fullSearchQuery) + "\""; - fullSearchQuery = "\"" + QueryParserUtil.escape(fullSearchQuery) + "\""; + String escapedSearchQuery = "\"" + QueryParserUtil.escape(searchQuery + " " + fullSearchQuery) + "\""; + String escapedFullSearchQuery = "\"" + QueryParserUtil.escape(fullSearchQuery) + "\""; // Build search query - StandardQueryParser qpHelper = new StandardQueryParser(new StandardAnalyzer()); + Analyzer analyzer = new StandardAnalyzer(); + StandardQueryParser qpHelper = new StandardQueryParser(analyzer); qpHelper.setPhraseSlop(100); // PhraseQuery add terms // Search on documents and files BooleanQuery query = new BooleanQuery.Builder() - .add(qpHelper.parse(searchQuery, "title"), BooleanClause.Occur.SHOULD) - .add(qpHelper.parse(searchQuery, "description"), BooleanClause.Occur.SHOULD) - .add(qpHelper.parse(searchQuery, "subject"), BooleanClause.Occur.SHOULD) - .add(qpHelper.parse(searchQuery, "identifier"), BooleanClause.Occur.SHOULD) - .add(qpHelper.parse(searchQuery, "publisher"), BooleanClause.Occur.SHOULD) - .add(qpHelper.parse(searchQuery, "format"), BooleanClause.Occur.SHOULD) - .add(qpHelper.parse(searchQuery, "source"), BooleanClause.Occur.SHOULD) - .add(qpHelper.parse(searchQuery, "type"), BooleanClause.Occur.SHOULD) - .add(qpHelper.parse(searchQuery, "coverage"), BooleanClause.Occur.SHOULD) - .add(qpHelper.parse(searchQuery, "rights"), BooleanClause.Occur.SHOULD) - .add(qpHelper.parse(searchQuery, "filename"), BooleanClause.Occur.SHOULD) - .add(qpHelper.parse(fullSearchQuery, "content"), BooleanClause.Occur.SHOULD) + .add(qpHelper.parse(escapedSearchQuery, "title"), BooleanClause.Occur.SHOULD) + .add(qpHelper.parse(escapedSearchQuery, "description"), BooleanClause.Occur.SHOULD) + .add(qpHelper.parse(escapedSearchQuery, "subject"), BooleanClause.Occur.SHOULD) + .add(qpHelper.parse(escapedSearchQuery, "identifier"), BooleanClause.Occur.SHOULD) + .add(qpHelper.parse(escapedSearchQuery, "publisher"), BooleanClause.Occur.SHOULD) + .add(qpHelper.parse(escapedSearchQuery, "format"), BooleanClause.Occur.SHOULD) + .add(qpHelper.parse(escapedSearchQuery, "source"), BooleanClause.Occur.SHOULD) + .add(qpHelper.parse(escapedSearchQuery, "type"), BooleanClause.Occur.SHOULD) + .add(qpHelper.parse(escapedSearchQuery, "coverage"), BooleanClause.Occur.SHOULD) + .add(qpHelper.parse(escapedSearchQuery, "rights"), BooleanClause.Occur.SHOULD) + .add(qpHelper.parse(escapedSearchQuery, "filename"), BooleanClause.Occur.SHOULD) + .add(qpHelper.parse(escapedFullSearchQuery, "content"), BooleanClause.Occur.SHOULD) .build(); // Search @@ -370,6 +405,10 @@ public class LuceneIndexingHandler implements IndexingHandler { TopDocs topDocs = searcher.search(query, Integer.MAX_VALUE); ScoreDoc[] docs = topDocs.scoreDocs; + SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("", ""); + SimpleHTMLEncoder simpleHTMLEncoder = new SimpleHTMLEncoder(); + Highlighter highlighter = new Highlighter(simpleHTMLFormatter, simpleHTMLEncoder, new QueryScorer(query)); + // Extract document IDs for (ScoreDoc doc : docs) { org.apache.lucene.document.Document document = searcher.doc(doc.doc); @@ -379,6 +418,15 @@ public class LuceneIndexingHandler implements IndexingHandler { documentId = document.get("id"); } else if (type.equals("file")) { documentId = document.get("document_id"); + + /* + needs full reindexing from previous version to make it work, we now need the file content + String content = document.get("content"); + if (content != null) { + String hl = highlighter.getBestFragment(analyzer, "content", content); + System.out.println(hl); + } + */ } if (documentId != null) { documentIdList.add(documentId); @@ -447,7 +495,7 @@ public class LuceneIndexingHandler implements IndexingHandler { luceneDocument.add(new StringField("document_id", file.getDocumentId(), Field.Store.YES)); } if (file.getContent() != null) { - luceneDocument.add(new TextField("content", file.getContent(), Field.Store.NO)); + luceneDocument.add(new TextField("content", file.getContent(), Field.Store.YES)); } return luceneDocument; diff --git a/docs-web/src/main/java/com/sismics/docs/rest/resource/DocumentResource.java b/docs-web/src/main/java/com/sismics/docs/rest/resource/DocumentResource.java index 78798adc..cd70cb60 100644 --- a/docs-web/src/main/java/com/sismics/docs/rest/resource/DocumentResource.java +++ b/docs-web/src/main/java/com/sismics/docs/rest/resource/DocumentResource.java @@ -367,11 +367,12 @@ public class DocumentResource extends BaseResource { TagDao tagDao = new TagDao(); PaginatedList paginatedList = PaginatedLists.create(limit, offset); + List suggestionList = Lists.newArrayList(); SortCriteria sortCriteria = new SortCriteria(sortColumn, asc); DocumentCriteria documentCriteria = parseSearchQuery(search); documentCriteria.setTargetIdList(getTargetIdList(null)); try { - AppContext.getInstance().getIndexingHandler().findByCriteria(paginatedList, documentCriteria, sortCriteria); + AppContext.getInstance().getIndexingHandler().findByCriteria(paginatedList, suggestionList, documentCriteria, sortCriteria); } catch (Exception e) { throw new ServerException("SearchError", "Error searching in documents", e); } @@ -402,8 +403,15 @@ public class DocumentResource extends BaseResource { .add("file_count", documentDto.getFileCount()) .add("tags", tags)); } + + JsonArrayBuilder suggestions = Json.createArrayBuilder(); + for (String suggestion : suggestionList) { + suggestions.add(suggestion); + } + response.add("total", paginatedList.getResultCount()) - .add("documents", documents); + .add("documents", documents) + .add("suggestions", suggestions); return Response.ok().entity(response.build()).build(); } diff --git a/docs-web/src/main/webapp/src/app/docs/controller/document/Document.js b/docs-web/src/main/webapp/src/app/docs/controller/document/Document.js index 912e309f..7305ed3d 100644 --- a/docs-web/src/main/webapp/src/app/docs/controller/document/Document.js +++ b/docs-web/src/main/webapp/src/app/docs/controller/document/Document.js @@ -38,6 +38,7 @@ angular.module('docs').controller('Document', function ($scope, $rootScope, $tim .then(function (data) { $scope.documents = data.documents; $scope.totalDocuments = data.total; + $scope.suggestions = data.suggestions; }); }; diff --git a/docs-web/src/main/webapp/src/partial/docs/document.html b/docs-web/src/main/webapp/src/partial/docs/document.html index 833102d8..5ec110c3 100644 --- a/docs-web/src/main/webapp/src/partial/docs/document.html +++ b/docs-web/src/main/webapp/src/partial/docs/document.html @@ -19,7 +19,10 @@
- +
diff --git a/pom.xml b/pom.xml index daf7068d..eeae33e3 100644 --- a/pom.xml +++ b/pom.xml @@ -388,6 +388,18 @@ ${org.apache.lucene.version} + + org.apache.lucene + lucene-suggest + ${org.apache.lucene.version} + + + + org.apache.lucene + lucene-highlighter + ${org.apache.lucene.version} + + org.imgscalr imgscalr-lib