Refine search result by customizing analyzer (#4456)

#### What type of PR is this?

/kind improvement
/area core
/milestone 2.9.x

#### What this PR does / why we need it:

- Removes the dependency `cn.shenyanchao.ik-analyzer:ik-analyzer:9.0.0` because it has no significant effect on search results.
- Customizes our own analyzer with StandardTokenizer, HTMLStripCharFilter and LowerCaseFilterFactory.

Please be aware that the default search field has become `content` instead of `title` + `excerpt` + `content`. If you want to search the title only, use `title: halo` as the query string. For more details, please refer to <https://lucene.apache.org/core/9_5_0/queryparser/org/apache/lucene/queryparser/flexible/standard/StandardQueryParser.html>.

#### Which issue(s) this PR fixes:

Fixes https://github.com/halo-dev/halo/issues/4455

#### Special notes for your reviewer:

#### Does this PR introduce a user-facing change?

```release-note
优化本地搜索引擎
```
pull/4490/head
John Niang 2023-08-25 23:46:12 +08:00 committed by GitHub
parent 138ffde7e2
commit e40b5d2388
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 54 additions and 73 deletions

View File

@ -42,7 +42,7 @@ dependencies {
api "org.apache.lucene:lucene-queryparser" api "org.apache.lucene:lucene-queryparser"
api "org.apache.lucene:lucene-highlighter" api "org.apache.lucene:lucene-highlighter"
api "org.apache.lucene:lucene-backward-codecs" api "org.apache.lucene:lucene-backward-codecs"
api 'cn.shenyanchao.ik-analyzer:ik-analyzer' api 'org.apache.lucene:lucene-analysis-common'
api "org.apache.commons:commons-lang3" api "org.apache.commons:commons-lang3"
api "io.seruco.encoding:base62" api "io.seruco.encoding:base62"

View File

@ -1,18 +1,21 @@
package run.halo.app.search.post; package run.halo.app.search.post;
import static org.apache.commons.lang3.StringUtils.stripToEmpty;
import static org.apache.lucene.document.Field.Store.NO;
import static org.apache.lucene.document.Field.Store.YES; import static org.apache.lucene.document.Field.Store.YES;
import static org.apache.lucene.index.IndexWriterConfig.OpenMode.CREATE_OR_APPEND; import static org.apache.lucene.index.IndexWriterConfig.OpenMode.CREATE_OR_APPEND;
import java.io.IOException; import java.io.IOException;
import java.text.NumberFormat;
import java.time.Instant; import java.time.Instant;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Set; import java.util.Set;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StoredField;
@ -22,24 +25,21 @@ import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import org.apache.lucene.queryparser.flexible.standard.config.PointsConfig;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort; import org.apache.lucene.search.Sort;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.FSDirectory;
import org.jsoup.Jsoup;
import org.jsoup.safety.Safelist;
import org.springframework.beans.factory.DisposableBean; import org.springframework.beans.factory.DisposableBean;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import org.springframework.util.StopWatch; import org.springframework.util.StopWatch;
import org.wltea.analyzer.lucene.IKAnalyzer;
import reactor.core.Exceptions; import reactor.core.Exceptions;
import run.halo.app.infra.properties.HaloProperties; import run.halo.app.infra.properties.HaloProperties;
import run.halo.app.search.SearchParam; import run.halo.app.search.SearchParam;
@ -49,15 +49,18 @@ import run.halo.app.search.SearchResult;
@Slf4j @Slf4j
public class LucenePostSearchService implements PostSearchService, DisposableBean { public class LucenePostSearchService implements PostSearchService, DisposableBean {
public static final int MAX_FRAGMENT_SIZE = 100;
private final Analyzer analyzer; private final Analyzer analyzer;
private final Directory postIndexDir; private final Directory postIndexDir;
public LucenePostSearchService(HaloProperties haloProperties) public LucenePostSearchService(HaloProperties haloProperties)
throws IOException { throws IOException {
analyzer = new IKAnalyzer(true); analyzer = CustomAnalyzer.builder()
.withTokenizer(StandardTokenizerFactory.class)
.addCharFilter(HTMLStripCharFilterFactory.NAME)
.addTokenFilter(LowerCaseFilterFactory.NAME)
.build();
var postIdxPath = haloProperties.getWorkDir().resolve("indices/posts"); var postIdxPath = haloProperties.getWorkDir().resolve("indices/posts");
postIndexDir = FSDirectory.open(postIdxPath); postIndexDir = FSDirectory.open(postIdxPath);
} }
@ -72,14 +75,35 @@ public class LucenePostSearchService implements PostSearchService, DisposableBea
var query = buildQuery(keyword); var query = buildQuery(keyword);
var topDocs = searcher.search(query, param.getLimit(), Sort.RELEVANCE); var topDocs = searcher.search(query, param.getLimit(), Sort.RELEVANCE);
watch.stop(); watch.stop();
var highlighter = new Highlighter(
new SimpleHTMLFormatter(param.getHighlightPreTag(), param.getHighlightPostTag()),
new QueryScorer(query));
highlighter.setTextFragmenter(new SimpleFragmenter(MAX_FRAGMENT_SIZE));
var formatter =
new SimpleHTMLFormatter(param.getHighlightPreTag(), param.getHighlightPostTag());
var scorer = new QueryScorer(query);
var highlighter = new Highlighter(formatter, new DefaultEncoder(), scorer);
var hits = new ArrayList<PostHit>(topDocs.scoreDocs.length); var hits = new ArrayList<PostHit>(topDocs.scoreDocs.length);
for (var scoreDoc : topDocs.scoreDocs) { for (var scoreDoc : topDocs.scoreDocs) {
hits.add(convert(searcher.storedFields().document(scoreDoc.doc), highlighter)); var doc = searcher.storedFields().document(scoreDoc.doc);
var title = doc.get("title");
var titleFragment = highlighter.getBestFragment(analyzer, "title", title);
if (titleFragment != null) {
title = titleFragment;
}
var content = doc.get("content");
var contentFragment = highlighter.getBestFragment(analyzer, "content", content);
if (contentFragment != null) {
content = contentFragment;
}
var post = new PostHit();
post.setName(doc.get("name"));
post.setTitle(title);
post.setContent(content);
var publishTimestamp = doc.getField("publishTimestamp").numericValue().longValue();
post.setPublishTimestamp(Instant.ofEpochSecond(publishTimestamp));
post.setPermalink(doc.get("permalink"));
hits.add(post);
} }
var result = new SearchResult<PostHit>(); var result = new SearchResult<PostHit>();
@ -141,72 +165,29 @@ public class LucenePostSearchService implements PostSearchService, DisposableBea
} }
private Query buildQuery(String keyword) throws ParseException { private Query buildQuery(String keyword) throws QueryNodeException {
if (log.isDebugEnabled()) { if (log.isDebugEnabled()) {
log.debug("Trying to search for keyword: {}", keyword); log.debug("Trying to search for keyword: {}", keyword);
} }
return new QueryParser("searchable", analyzer).parse(keyword); var parser = new StandardQueryParser(analyzer);
parser.setPointsConfigMap(Map.of(
"publishTimestamp", new PointsConfig(NumberFormat.getNumberInstance(), Long.class)
));
return parser.parse(keyword, "content");
} }
private Document convert(PostDoc post) { private Document convert(PostDoc post) {
var doc = new Document(); var doc = new Document();
doc.add(new StringField("name", post.name(), YES)); doc.add(new StringField("name", post.name(), YES));
doc.add(new StoredField("title", post.title())); doc.add(new TextField("title", post.title(), YES));
doc.add(new TextField("excerpt", post.excerpt(), YES));
doc.add(new TextField("content", post.content(), YES));
var cleanExcerpt = Jsoup.clean(stripToEmpty(post.excerpt()), Safelist.none()); var publishTimestamp = post.publishTimestamp().getEpochSecond();
var cleanContent = Jsoup.clean(stripToEmpty(post.content()), Safelist.none());
var contentBuilder = new StringBuilder(cleanExcerpt);
if (!contentBuilder.isEmpty()) {
contentBuilder.append(' ');
}
contentBuilder.append(cleanContent);
var content = contentBuilder.toString();
doc.add(new StoredField("content", content));
doc.add(new TextField("searchable", post.title() + " " + content, NO));
long publishTimestamp = post.publishTimestamp().toEpochMilli();
doc.add(new LongPoint("publishTimestamp", publishTimestamp)); doc.add(new LongPoint("publishTimestamp", publishTimestamp));
doc.add(new StoredField("publishTimestamp", publishTimestamp)); doc.add(new StoredField("publishTimestamp", publishTimestamp));
doc.add(new StoredField("permalink", post.permalink())); doc.add(new StoredField("permalink", post.permalink()));
return doc; return doc;
} }
private PostHit convert(Document doc, Highlighter highlighter)
throws IOException, InvalidTokenOffsetsException {
var post = new PostHit();
post.setName(doc.get("name"));
var title = getHighlightedText(doc, "title", highlighter, MAX_FRAGMENT_SIZE);
post.setTitle(title);
var content = getHighlightedText(doc, "content", highlighter, MAX_FRAGMENT_SIZE);
post.setContent(content);
var publishTimestamp = doc.getField("publishTimestamp").numericValue().longValue();
post.setPublishTimestamp(Instant.ofEpochMilli(publishTimestamp));
post.setPermalink(doc.get("permalink"));
return post;
}
private String getHighlightedText(Document doc, String field, Highlighter highlighter,
int maxLength)
throws InvalidTokenOffsetsException, IOException {
try {
var highlightedText = highlighter.getBestFragment(analyzer, field, doc.get(field));
if (highlightedText != null) {
return highlightedText;
}
} catch (IllegalArgumentException iae) {
// TODO we have to ignore the error currently due to no solution about the error.
if (!"boost must be a positive float, got -1.0".equals(iae.getMessage())) {
throw iae;
}
}
// handle if there is not highlighted text
var fieldValue = doc.get(field);
return StringUtils.substring(fieldValue, 0, maxLength);
}
} }

View File

@ -39,7 +39,7 @@ dependencies {
api "org.apache.lucene:lucene-queryparser:$lucene" api "org.apache.lucene:lucene-queryparser:$lucene"
api "org.apache.lucene:lucene-highlighter:$lucene" api "org.apache.lucene:lucene-highlighter:$lucene"
api "org.apache.lucene:lucene-backward-codecs:$lucene" api "org.apache.lucene:lucene-backward-codecs:$lucene"
api 'cn.shenyanchao.ik-analyzer:ik-analyzer:9.0.0' api "org.apache.lucene:lucene-analysis-common:$lucene"
api "org.apache.commons:commons-lang3:$commonsLang3" api "org.apache.commons:commons-lang3:$commonsLang3"
api "io.seruco.encoding:base62:$base62" api "io.seruco.encoding:base62:$base62"