Refine search result by customizing analyzer (#4456)

#### What type of PR is this?

/kind improvement
/area core
/milestone 2.9.x

#### What this PR does / why we need it:

- Removes the dependency `cn.shenyanchao.ik-analyzer:ik-analyzer:9.0.0` because it has no significant effect on search results.
- Customizes our own analyzer with StandardTokenizer, HTMLStripCharFilter and LowerCaseFilterFactory.

Please be aware that the default search field has become `content` instead of `title` + `excerpt` + `content`. If you want to search the title only, use `title: halo` as the query string. For more details, please refer to <https://lucene.apache.org/core/9_5_0/queryparser/org/apache/lucene/queryparser/flexible/standard/StandardQueryParser.html>.

#### Which issue(s) this PR fixes:

Fixes https://github.com/halo-dev/halo/issues/4455

#### Special notes for your reviewer:

#### Does this PR introduce a user-facing change?

```release-note
优化本地搜索引擎
```
pull/4490/head
John Niang 2023-08-25 23:46:12 +08:00 committed by GitHub
parent 138ffde7e2
commit e40b5d2388
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 54 additions and 73 deletions

View File

@ -42,7 +42,7 @@ dependencies {
api "org.apache.lucene:lucene-queryparser" api "org.apache.lucene:lucene-queryparser"
api "org.apache.lucene:lucene-highlighter" api "org.apache.lucene:lucene-highlighter"
api "org.apache.lucene:lucene-backward-codecs" api "org.apache.lucene:lucene-backward-codecs"
api 'cn.shenyanchao.ik-analyzer:ik-analyzer' api 'org.apache.lucene:lucene-analysis-common'
api "org.apache.commons:commons-lang3" api "org.apache.commons:commons-lang3"
api "io.seruco.encoding:base62" api "io.seruco.encoding:base62"

View File

@ -1,18 +1,21 @@
package run.halo.app.search.post; package run.halo.app.search.post;
import static org.apache.commons.lang3.StringUtils.stripToEmpty;
import static org.apache.lucene.document.Field.Store.NO;
import static org.apache.lucene.document.Field.Store.YES; import static org.apache.lucene.document.Field.Store.YES;
import static org.apache.lucene.index.IndexWriterConfig.OpenMode.CREATE_OR_APPEND; import static org.apache.lucene.index.IndexWriterConfig.OpenMode.CREATE_OR_APPEND;
import java.io.IOException; import java.io.IOException;
import java.text.NumberFormat;
import java.time.Instant; import java.time.Instant;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Set; import java.util.Set;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StoredField;
@ -22,24 +25,21 @@ import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import org.apache.lucene.queryparser.flexible.standard.config.PointsConfig;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort; import org.apache.lucene.search.Sort;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.FSDirectory;
import org.jsoup.Jsoup;
import org.jsoup.safety.Safelist;
import org.springframework.beans.factory.DisposableBean; import org.springframework.beans.factory.DisposableBean;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import org.springframework.util.StopWatch; import org.springframework.util.StopWatch;
import org.wltea.analyzer.lucene.IKAnalyzer;
import reactor.core.Exceptions; import reactor.core.Exceptions;
import run.halo.app.infra.properties.HaloProperties; import run.halo.app.infra.properties.HaloProperties;
import run.halo.app.search.SearchParam; import run.halo.app.search.SearchParam;
@ -49,15 +49,18 @@ import run.halo.app.search.SearchResult;
@Slf4j @Slf4j
public class LucenePostSearchService implements PostSearchService, DisposableBean { public class LucenePostSearchService implements PostSearchService, DisposableBean {
public static final int MAX_FRAGMENT_SIZE = 100;
private final Analyzer analyzer; private final Analyzer analyzer;
private final Directory postIndexDir; private final Directory postIndexDir;
public LucenePostSearchService(HaloProperties haloProperties) public LucenePostSearchService(HaloProperties haloProperties)
throws IOException { throws IOException {
analyzer = new IKAnalyzer(true); analyzer = CustomAnalyzer.builder()
.withTokenizer(StandardTokenizerFactory.class)
.addCharFilter(HTMLStripCharFilterFactory.NAME)
.addTokenFilter(LowerCaseFilterFactory.NAME)
.build();
var postIdxPath = haloProperties.getWorkDir().resolve("indices/posts"); var postIdxPath = haloProperties.getWorkDir().resolve("indices/posts");
postIndexDir = FSDirectory.open(postIdxPath); postIndexDir = FSDirectory.open(postIdxPath);
} }
@ -72,14 +75,35 @@ public class LucenePostSearchService implements PostSearchService, DisposableBea
var query = buildQuery(keyword); var query = buildQuery(keyword);
var topDocs = searcher.search(query, param.getLimit(), Sort.RELEVANCE); var topDocs = searcher.search(query, param.getLimit(), Sort.RELEVANCE);
watch.stop(); watch.stop();
var highlighter = new Highlighter(
new SimpleHTMLFormatter(param.getHighlightPreTag(), param.getHighlightPostTag()),
new QueryScorer(query));
highlighter.setTextFragmenter(new SimpleFragmenter(MAX_FRAGMENT_SIZE));
var formatter =
new SimpleHTMLFormatter(param.getHighlightPreTag(), param.getHighlightPostTag());
var scorer = new QueryScorer(query);
var highlighter = new Highlighter(formatter, new DefaultEncoder(), scorer);
var hits = new ArrayList<PostHit>(topDocs.scoreDocs.length); var hits = new ArrayList<PostHit>(topDocs.scoreDocs.length);
for (var scoreDoc : topDocs.scoreDocs) { for (var scoreDoc : topDocs.scoreDocs) {
hits.add(convert(searcher.storedFields().document(scoreDoc.doc), highlighter)); var doc = searcher.storedFields().document(scoreDoc.doc);
var title = doc.get("title");
var titleFragment = highlighter.getBestFragment(analyzer, "title", title);
if (titleFragment != null) {
title = titleFragment;
}
var content = doc.get("content");
var contentFragment = highlighter.getBestFragment(analyzer, "content", content);
if (contentFragment != null) {
content = contentFragment;
}
var post = new PostHit();
post.setName(doc.get("name"));
post.setTitle(title);
post.setContent(content);
var publishTimestamp = doc.getField("publishTimestamp").numericValue().longValue();
post.setPublishTimestamp(Instant.ofEpochSecond(publishTimestamp));
post.setPermalink(doc.get("permalink"));
hits.add(post);
} }
var result = new SearchResult<PostHit>(); var result = new SearchResult<PostHit>();
@ -141,72 +165,29 @@ public class LucenePostSearchService implements PostSearchService, DisposableBea
} }
private Query buildQuery(String keyword) throws ParseException { private Query buildQuery(String keyword) throws QueryNodeException {
if (log.isDebugEnabled()) { if (log.isDebugEnabled()) {
log.debug("Trying to search for keyword: {}", keyword); log.debug("Trying to search for keyword: {}", keyword);
} }
return new QueryParser("searchable", analyzer).parse(keyword); var parser = new StandardQueryParser(analyzer);
parser.setPointsConfigMap(Map.of(
"publishTimestamp", new PointsConfig(NumberFormat.getNumberInstance(), Long.class)
));
return parser.parse(keyword, "content");
} }
private Document convert(PostDoc post) { private Document convert(PostDoc post) {
var doc = new Document(); var doc = new Document();
doc.add(new StringField("name", post.name(), YES)); doc.add(new StringField("name", post.name(), YES));
doc.add(new StoredField("title", post.title())); doc.add(new TextField("title", post.title(), YES));
doc.add(new TextField("excerpt", post.excerpt(), YES));
doc.add(new TextField("content", post.content(), YES));
var cleanExcerpt = Jsoup.clean(stripToEmpty(post.excerpt()), Safelist.none()); var publishTimestamp = post.publishTimestamp().getEpochSecond();
var cleanContent = Jsoup.clean(stripToEmpty(post.content()), Safelist.none());
var contentBuilder = new StringBuilder(cleanExcerpt);
if (!contentBuilder.isEmpty()) {
contentBuilder.append(' ');
}
contentBuilder.append(cleanContent);
var content = contentBuilder.toString();
doc.add(new StoredField("content", content));
doc.add(new TextField("searchable", post.title() + " " + content, NO));
long publishTimestamp = post.publishTimestamp().toEpochMilli();
doc.add(new LongPoint("publishTimestamp", publishTimestamp)); doc.add(new LongPoint("publishTimestamp", publishTimestamp));
doc.add(new StoredField("publishTimestamp", publishTimestamp)); doc.add(new StoredField("publishTimestamp", publishTimestamp));
doc.add(new StoredField("permalink", post.permalink())); doc.add(new StoredField("permalink", post.permalink()));
return doc; return doc;
} }
private PostHit convert(Document doc, Highlighter highlighter)
throws IOException, InvalidTokenOffsetsException {
var post = new PostHit();
post.setName(doc.get("name"));
var title = getHighlightedText(doc, "title", highlighter, MAX_FRAGMENT_SIZE);
post.setTitle(title);
var content = getHighlightedText(doc, "content", highlighter, MAX_FRAGMENT_SIZE);
post.setContent(content);
var publishTimestamp = doc.getField("publishTimestamp").numericValue().longValue();
post.setPublishTimestamp(Instant.ofEpochMilli(publishTimestamp));
post.setPermalink(doc.get("permalink"));
return post;
}
private String getHighlightedText(Document doc, String field, Highlighter highlighter,
int maxLength)
throws InvalidTokenOffsetsException, IOException {
try {
var highlightedText = highlighter.getBestFragment(analyzer, field, doc.get(field));
if (highlightedText != null) {
return highlightedText;
}
} catch (IllegalArgumentException iae) {
// TODO we have to ignore the error currently due to no solution about the error.
if (!"boost must be a positive float, got -1.0".equals(iae.getMessage())) {
throw iae;
}
}
// handle if there is not highlighted text
var fieldValue = doc.get(field);
return StringUtils.substring(fieldValue, 0, maxLength);
}
} }

View File

@ -39,7 +39,7 @@ dependencies {
api "org.apache.lucene:lucene-queryparser:$lucene" api "org.apache.lucene:lucene-queryparser:$lucene"
api "org.apache.lucene:lucene-highlighter:$lucene" api "org.apache.lucene:lucene-highlighter:$lucene"
api "org.apache.lucene:lucene-backward-codecs:$lucene" api "org.apache.lucene:lucene-backward-codecs:$lucene"
api 'cn.shenyanchao.ik-analyzer:ik-analyzer:9.0.0' api "org.apache.lucene:lucene-analysis-common:$lucene"
api "org.apache.commons:commons-lang3:$commonsLang3" api "org.apache.commons:commons-lang3:$commonsLang3"
api "io.seruco.encoding:base62:$base62" api "io.seruco.encoding:base62:$base62"