mirror of https://github.com/halo-dev/halo
Refine search result by customizing analyzer (#4456)
#### What type of PR is this? /kind improvement /area core /milestone 2.9.x #### What this PR does / why we need it: - Removes the dependency `cn.shenyanchao.ik-analyzer:ik-analyzer:9.0.0` because it has no significant effect on search results. - Customizes our own analyzer with StandardTokenizer, HTMLStripCharFilter and LowerCaseFilterFactory. Please be aware that the default search field has become `content` instead of `title` + `excerpt` + `content`. If someone wants to search the title only, use `title: halo` as the query string. For more details, please refer to <https://lucene.apache.org/core/9_5_0/queryparser/org/apache/lucene/queryparser/flexible/standard/StandardQueryParser.html>. #### Which issue(s) this PR fixes: Fixes https://github.com/halo-dev/halo/issues/4455 #### Special notes for your reviewer: #### Does this PR introduce a user-facing change? ```release-note 优化本地搜索引擎 ``` pull/4490/head
parent
138ffde7e2
commit
e40b5d2388
|
@ -42,7 +42,7 @@ dependencies {
|
||||||
api "org.apache.lucene:lucene-queryparser"
|
api "org.apache.lucene:lucene-queryparser"
|
||||||
api "org.apache.lucene:lucene-highlighter"
|
api "org.apache.lucene:lucene-highlighter"
|
||||||
api "org.apache.lucene:lucene-backward-codecs"
|
api "org.apache.lucene:lucene-backward-codecs"
|
||||||
api 'cn.shenyanchao.ik-analyzer:ik-analyzer'
|
api 'org.apache.lucene:lucene-analysis-common'
|
||||||
|
|
||||||
api "org.apache.commons:commons-lang3"
|
api "org.apache.commons:commons-lang3"
|
||||||
api "io.seruco.encoding:base62"
|
api "io.seruco.encoding:base62"
|
||||||
|
|
|
@ -1,18 +1,21 @@
|
||||||
package run.halo.app.search.post;
|
package run.halo.app.search.post;
|
||||||
|
|
||||||
import static org.apache.commons.lang3.StringUtils.stripToEmpty;
|
|
||||||
import static org.apache.lucene.document.Field.Store.NO;
|
|
||||||
import static org.apache.lucene.document.Field.Store.YES;
|
import static org.apache.lucene.document.Field.Store.YES;
|
||||||
import static org.apache.lucene.index.IndexWriterConfig.OpenMode.CREATE_OR_APPEND;
|
import static org.apache.lucene.index.IndexWriterConfig.OpenMode.CREATE_OR_APPEND;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.text.NumberFormat;
|
||||||
import java.time.Instant;
|
import java.time.Instant;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
|
||||||
|
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
|
||||||
|
import org.apache.lucene.analysis.custom.CustomAnalyzer;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.LongPoint;
|
import org.apache.lucene.document.LongPoint;
|
||||||
import org.apache.lucene.document.StoredField;
|
import org.apache.lucene.document.StoredField;
|
||||||
|
@ -22,24 +25,21 @@ import org.apache.lucene.index.DirectoryReader;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
import org.apache.lucene.index.IndexWriterConfig;
|
import org.apache.lucene.index.IndexWriterConfig;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.queryparser.classic.ParseException;
|
import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
|
||||||
import org.apache.lucene.queryparser.classic.QueryParser;
|
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
|
||||||
|
import org.apache.lucene.queryparser.flexible.standard.config.PointsConfig;
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
import org.apache.lucene.search.Sort;
|
import org.apache.lucene.search.Sort;
|
||||||
|
import org.apache.lucene.search.highlight.DefaultEncoder;
|
||||||
import org.apache.lucene.search.highlight.Highlighter;
|
import org.apache.lucene.search.highlight.Highlighter;
|
||||||
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
|
|
||||||
import org.apache.lucene.search.highlight.QueryScorer;
|
import org.apache.lucene.search.highlight.QueryScorer;
|
||||||
import org.apache.lucene.search.highlight.SimpleFragmenter;
|
|
||||||
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
|
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.FSDirectory;
|
import org.apache.lucene.store.FSDirectory;
|
||||||
import org.jsoup.Jsoup;
|
|
||||||
import org.jsoup.safety.Safelist;
|
|
||||||
import org.springframework.beans.factory.DisposableBean;
|
import org.springframework.beans.factory.DisposableBean;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
import org.springframework.util.StopWatch;
|
import org.springframework.util.StopWatch;
|
||||||
import org.wltea.analyzer.lucene.IKAnalyzer;
|
|
||||||
import reactor.core.Exceptions;
|
import reactor.core.Exceptions;
|
||||||
import run.halo.app.infra.properties.HaloProperties;
|
import run.halo.app.infra.properties.HaloProperties;
|
||||||
import run.halo.app.search.SearchParam;
|
import run.halo.app.search.SearchParam;
|
||||||
|
@ -49,15 +49,18 @@ import run.halo.app.search.SearchResult;
|
||||||
@Slf4j
|
@Slf4j
|
||||||
public class LucenePostSearchService implements PostSearchService, DisposableBean {
|
public class LucenePostSearchService implements PostSearchService, DisposableBean {
|
||||||
|
|
||||||
public static final int MAX_FRAGMENT_SIZE = 100;
|
|
||||||
|
|
||||||
private final Analyzer analyzer;
|
private final Analyzer analyzer;
|
||||||
|
|
||||||
private final Directory postIndexDir;
|
private final Directory postIndexDir;
|
||||||
|
|
||||||
public LucenePostSearchService(HaloProperties haloProperties)
|
public LucenePostSearchService(HaloProperties haloProperties)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
analyzer = new IKAnalyzer(true);
|
analyzer = CustomAnalyzer.builder()
|
||||||
|
.withTokenizer(StandardTokenizerFactory.class)
|
||||||
|
.addCharFilter(HTMLStripCharFilterFactory.NAME)
|
||||||
|
.addTokenFilter(LowerCaseFilterFactory.NAME)
|
||||||
|
.build();
|
||||||
|
|
||||||
var postIdxPath = haloProperties.getWorkDir().resolve("indices/posts");
|
var postIdxPath = haloProperties.getWorkDir().resolve("indices/posts");
|
||||||
postIndexDir = FSDirectory.open(postIdxPath);
|
postIndexDir = FSDirectory.open(postIdxPath);
|
||||||
}
|
}
|
||||||
|
@ -72,14 +75,35 @@ public class LucenePostSearchService implements PostSearchService, DisposableBea
|
||||||
var query = buildQuery(keyword);
|
var query = buildQuery(keyword);
|
||||||
var topDocs = searcher.search(query, param.getLimit(), Sort.RELEVANCE);
|
var topDocs = searcher.search(query, param.getLimit(), Sort.RELEVANCE);
|
||||||
watch.stop();
|
watch.stop();
|
||||||
var highlighter = new Highlighter(
|
|
||||||
new SimpleHTMLFormatter(param.getHighlightPreTag(), param.getHighlightPostTag()),
|
|
||||||
new QueryScorer(query));
|
|
||||||
highlighter.setTextFragmenter(new SimpleFragmenter(MAX_FRAGMENT_SIZE));
|
|
||||||
|
|
||||||
|
var formatter =
|
||||||
|
new SimpleHTMLFormatter(param.getHighlightPreTag(), param.getHighlightPostTag());
|
||||||
|
var scorer = new QueryScorer(query);
|
||||||
|
var highlighter = new Highlighter(formatter, new DefaultEncoder(), scorer);
|
||||||
var hits = new ArrayList<PostHit>(topDocs.scoreDocs.length);
|
var hits = new ArrayList<PostHit>(topDocs.scoreDocs.length);
|
||||||
for (var scoreDoc : topDocs.scoreDocs) {
|
for (var scoreDoc : topDocs.scoreDocs) {
|
||||||
hits.add(convert(searcher.storedFields().document(scoreDoc.doc), highlighter));
|
var doc = searcher.storedFields().document(scoreDoc.doc);
|
||||||
|
|
||||||
|
var title = doc.get("title");
|
||||||
|
var titleFragment = highlighter.getBestFragment(analyzer, "title", title);
|
||||||
|
if (titleFragment != null) {
|
||||||
|
title = titleFragment;
|
||||||
|
}
|
||||||
|
|
||||||
|
var content = doc.get("content");
|
||||||
|
var contentFragment = highlighter.getBestFragment(analyzer, "content", content);
|
||||||
|
if (contentFragment != null) {
|
||||||
|
content = contentFragment;
|
||||||
|
}
|
||||||
|
|
||||||
|
var post = new PostHit();
|
||||||
|
post.setName(doc.get("name"));
|
||||||
|
post.setTitle(title);
|
||||||
|
post.setContent(content);
|
||||||
|
var publishTimestamp = doc.getField("publishTimestamp").numericValue().longValue();
|
||||||
|
post.setPublishTimestamp(Instant.ofEpochSecond(publishTimestamp));
|
||||||
|
post.setPermalink(doc.get("permalink"));
|
||||||
|
hits.add(post);
|
||||||
}
|
}
|
||||||
|
|
||||||
var result = new SearchResult<PostHit>();
|
var result = new SearchResult<PostHit>();
|
||||||
|
@ -141,72 +165,29 @@ public class LucenePostSearchService implements PostSearchService, DisposableBea
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Query buildQuery(String keyword) throws ParseException {
|
private Query buildQuery(String keyword) throws QueryNodeException {
|
||||||
if (log.isDebugEnabled()) {
|
if (log.isDebugEnabled()) {
|
||||||
log.debug("Trying to search for keyword: {}", keyword);
|
log.debug("Trying to search for keyword: {}", keyword);
|
||||||
}
|
}
|
||||||
return new QueryParser("searchable", analyzer).parse(keyword);
|
var parser = new StandardQueryParser(analyzer);
|
||||||
|
parser.setPointsConfigMap(Map.of(
|
||||||
|
"publishTimestamp", new PointsConfig(NumberFormat.getNumberInstance(), Long.class)
|
||||||
|
));
|
||||||
|
return parser.parse(keyword, "content");
|
||||||
}
|
}
|
||||||
|
|
||||||
private Document convert(PostDoc post) {
|
private Document convert(PostDoc post) {
|
||||||
var doc = new Document();
|
var doc = new Document();
|
||||||
doc.add(new StringField("name", post.name(), YES));
|
doc.add(new StringField("name", post.name(), YES));
|
||||||
doc.add(new StoredField("title", post.title()));
|
doc.add(new TextField("title", post.title(), YES));
|
||||||
|
doc.add(new TextField("excerpt", post.excerpt(), YES));
|
||||||
|
doc.add(new TextField("content", post.content(), YES));
|
||||||
|
|
||||||
var cleanExcerpt = Jsoup.clean(stripToEmpty(post.excerpt()), Safelist.none());
|
var publishTimestamp = post.publishTimestamp().getEpochSecond();
|
||||||
var cleanContent = Jsoup.clean(stripToEmpty(post.content()), Safelist.none());
|
|
||||||
|
|
||||||
var contentBuilder = new StringBuilder(cleanExcerpt);
|
|
||||||
if (!contentBuilder.isEmpty()) {
|
|
||||||
contentBuilder.append(' ');
|
|
||||||
}
|
|
||||||
contentBuilder.append(cleanContent);
|
|
||||||
|
|
||||||
var content = contentBuilder.toString();
|
|
||||||
|
|
||||||
doc.add(new StoredField("content", content));
|
|
||||||
doc.add(new TextField("searchable", post.title() + " " + content, NO));
|
|
||||||
|
|
||||||
long publishTimestamp = post.publishTimestamp().toEpochMilli();
|
|
||||||
doc.add(new LongPoint("publishTimestamp", publishTimestamp));
|
doc.add(new LongPoint("publishTimestamp", publishTimestamp));
|
||||||
doc.add(new StoredField("publishTimestamp", publishTimestamp));
|
doc.add(new StoredField("publishTimestamp", publishTimestamp));
|
||||||
doc.add(new StoredField("permalink", post.permalink()));
|
doc.add(new StoredField("permalink", post.permalink()));
|
||||||
return doc;
|
return doc;
|
||||||
}
|
}
|
||||||
|
|
||||||
private PostHit convert(Document doc, Highlighter highlighter)
|
|
||||||
throws IOException, InvalidTokenOffsetsException {
|
|
||||||
var post = new PostHit();
|
|
||||||
post.setName(doc.get("name"));
|
|
||||||
|
|
||||||
var title = getHighlightedText(doc, "title", highlighter, MAX_FRAGMENT_SIZE);
|
|
||||||
post.setTitle(title);
|
|
||||||
|
|
||||||
var content = getHighlightedText(doc, "content", highlighter, MAX_FRAGMENT_SIZE);
|
|
||||||
post.setContent(content);
|
|
||||||
|
|
||||||
var publishTimestamp = doc.getField("publishTimestamp").numericValue().longValue();
|
|
||||||
post.setPublishTimestamp(Instant.ofEpochMilli(publishTimestamp));
|
|
||||||
post.setPermalink(doc.get("permalink"));
|
|
||||||
return post;
|
|
||||||
}
|
|
||||||
|
|
||||||
private String getHighlightedText(Document doc, String field, Highlighter highlighter,
|
|
||||||
int maxLength)
|
|
||||||
throws InvalidTokenOffsetsException, IOException {
|
|
||||||
try {
|
|
||||||
var highlightedText = highlighter.getBestFragment(analyzer, field, doc.get(field));
|
|
||||||
if (highlightedText != null) {
|
|
||||||
return highlightedText;
|
|
||||||
}
|
|
||||||
} catch (IllegalArgumentException iae) {
|
|
||||||
// TODO we have to ignore the error currently due to no solution about the error.
|
|
||||||
if (!"boost must be a positive float, got -1.0".equals(iae.getMessage())) {
|
|
||||||
throw iae;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// handle if there is not highlighted text
|
|
||||||
var fieldValue = doc.get(field);
|
|
||||||
return StringUtils.substring(fieldValue, 0, maxLength);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,7 +39,7 @@ dependencies {
|
||||||
api "org.apache.lucene:lucene-queryparser:$lucene"
|
api "org.apache.lucene:lucene-queryparser:$lucene"
|
||||||
api "org.apache.lucene:lucene-highlighter:$lucene"
|
api "org.apache.lucene:lucene-highlighter:$lucene"
|
||||||
api "org.apache.lucene:lucene-backward-codecs:$lucene"
|
api "org.apache.lucene:lucene-backward-codecs:$lucene"
|
||||||
api 'cn.shenyanchao.ik-analyzer:ik-analyzer:9.0.0'
|
api "org.apache.lucene:lucene-analysis-common:$lucene"
|
||||||
|
|
||||||
api "org.apache.commons:commons-lang3:$commonsLang3"
|
api "org.apache.commons:commons-lang3:$commonsLang3"
|
||||||
api "io.seruco.encoding:base62:$base62"
|
api "io.seruco.encoding:base62:$base62"
|
||||||
|
|
Loading…
Reference in New Issue