mirror of https://github.com/halo-dev/halo
feat: Optimized word count for non-Chinese articles (#1865)
* feat: word count optimization. Now it can accurately identify the number of words in English articles and mixed language articles with character fields. * checkstyle * delete field charCount * fix typo and add some complex unit test * refine unit test * uniform word count * fix style pull/2076/head
parent
15d2f8e56c
commit
508c41b0fc
|
@ -143,7 +143,7 @@ public class BasePost extends BaseEntity {
|
|||
private Integer topPriority;
|
||||
|
||||
/**
|
||||
* Likes
|
||||
* Likes.
|
||||
*/
|
||||
@Column(name = "likes")
|
||||
@ColumnDefault("0")
|
||||
|
@ -169,7 +169,7 @@ public class BasePost extends BaseEntity {
|
|||
private String metaDescription;
|
||||
|
||||
/**
|
||||
* Content word count
|
||||
* Content word count.
|
||||
*/
|
||||
@Column(name = "word_count")
|
||||
@ColumnDefault("0")
|
||||
|
@ -188,6 +188,7 @@ public class BasePost extends BaseEntity {
|
|||
@Transient
|
||||
private PatchedContent content;
|
||||
|
||||
|
||||
@Override
|
||||
public void prePersist() {
|
||||
super.prePersist();
|
||||
|
@ -243,6 +244,7 @@ public class BasePost extends BaseEntity {
|
|||
if (version == null || version < 0) {
|
||||
version = 1;
|
||||
}
|
||||
|
||||
// Clear the value of the deprecated attributes
|
||||
this.originalContent = "";
|
||||
this.formatContent = "";
|
||||
|
|
|
@ -64,6 +64,10 @@ public abstract class BasePostServiceImpl<POST extends BasePost>
|
|||
|
||||
// Matches a single whitespace character; htmlFormatCharacterCount runs it over the tag-free text.
private static final Pattern BLANK_PATTERN = Pattern.compile("\\s");
|
||||
|
||||
// Matches any single character outside the range U+0000-U+00FF. Despite the name this is not
// limited to Chinese: full-width punctuation and every other non-Latin-1 character match too.
// htmlFormatWordCount counts each matched character as one word.
private static final String CHINESE_REGEX = "[^\\x00-\\xff]";
|
||||
|
||||
// Matches a run of punctuation, symbol, separator or whitespace characters; used as the
// delimiter when the remaining (Latin-1) text is split into words.
private static final String PUNCTUATION_REGEX = "[\\p{P}\\p{S}\\p{Z}\\s]+";
|
||||
|
||||
public BasePostServiceImpl(BasePostRepository<POST> basePostRepository,
|
||||
OptionService optionService,
|
||||
ContentService contentService,
|
||||
|
@ -301,7 +305,6 @@ public abstract class BasePostServiceImpl<POST extends BasePost>
|
|||
PatchedContent postContent = post.getContent();
|
||||
// word count stat
|
||||
post.setWordCount(htmlFormatWordCount(postContent.getContent()));
|
||||
|
||||
POST savedPost;
|
||||
// Create or update post
|
||||
if (ServiceUtils.isEmptyId(post.getId())) {
|
||||
|
@ -484,7 +487,7 @@ public abstract class BasePostServiceImpl<POST extends BasePost>
|
|||
}
|
||||
}
|
||||
|
||||
// CS304 issue link : https://github.com/halo-dev/halo/issues/1224
|
||||
// CS304 issue link : https://github.com/halo-dev/halo/issues/1759
|
||||
|
||||
/**
|
||||
* @param htmlContent the HTML-formatted content (for example, rendered from markdown)
|
||||
|
@ -498,6 +501,39 @@ public abstract class BasePostServiceImpl<POST extends BasePost>
|
|||
|
||||
String cleanContent = HaloUtils.cleanHtmlTag(htmlContent);
|
||||
|
||||
String tempString = cleanContent.replaceAll(CHINESE_REGEX, "");
|
||||
|
||||
String otherString = cleanContent.replaceAll(CHINESE_REGEX, " ");
|
||||
|
||||
int chineseWordCount = cleanContent.length() - tempString.length();
|
||||
|
||||
String[] otherWords = otherString.split(PUNCTUATION_REGEX);
|
||||
|
||||
int otherWordLength = otherWords.length;
|
||||
|
||||
if (otherWordLength > 0 && otherWords[0].length() == 0) {
|
||||
otherWordLength--;
|
||||
}
|
||||
|
||||
if (otherWords.length > 1 && otherWords[otherWords.length - 1].length() == 0) {
|
||||
otherWordLength--;
|
||||
}
|
||||
|
||||
return chineseWordCount + otherWordLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param htmlContent the HTML-formatted content (for example, rendered from markdown)
|
||||
* @return character count except space and line separator
|
||||
*/
|
||||
|
||||
public static long htmlFormatCharacterCount(String htmlContent) {
|
||||
if (htmlContent == null) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
String cleanContent = HaloUtils.cleanHtmlTag(htmlContent);
|
||||
|
||||
Matcher matcher = BLANK_PATTERN.matcher(cleanContent);
|
||||
|
||||
int count = 0;
|
||||
|
|
|
@ -59,6 +59,20 @@ public class HTMLWordCountTest {
|
|||
|
||||
String emptyString = "";
|
||||
|
||||
String englishString = "I have a red apple";
|
||||
|
||||
String hybridString = "I have a red apple哈哈";
|
||||
|
||||
|
||||
String complexText2 = "Hi,Jessica!这个project的schedule有些问题。";
|
||||
|
||||
String complexText3 = "The company had a meeting yesterday。Why did you ask for leave?";
|
||||
|
||||
String complexText4 = "这是一个句子,但是只有中文。";
|
||||
|
||||
String complexText5 =
|
||||
"The wind and the moon are all beautiful, love and hate are all romantic.";
|
||||
|
||||
@Test
|
||||
void pictureTest() {
|
||||
assertEquals("图片字数测试".length(),
|
||||
|
@ -128,4 +142,42 @@ public class HTMLWordCountTest {
|
|||
assertEquals(0,
|
||||
BasePostServiceImpl.htmlFormatWordCount(MarkdownUtils.renderHtml(emptyString)));
|
||||
}
|
||||
}
|
||||
|
||||
// englishString ("I have a red apple") holds five space-separated English words, so 5 is expected.
@Test
|
||||
void englishTest() {
|
||||
assertEquals(5,
|
||||
BasePostServiceImpl.htmlFormatWordCount(MarkdownUtils.renderHtml(englishString)));
|
||||
}
|
||||
|
||||
// hybridString is the five English words followed by two Chinese characters; each Chinese
// character counts as one word, so 5 + 2 = 7 is expected.
@Test
|
||||
void hybridTest() {
|
||||
assertEquals(7,
|
||||
BasePostServiceImpl.htmlFormatWordCount(MarkdownUtils.renderHtml(hybridString)));
|
||||
}
|
||||
|
||||
// The character count excludes whitespace: 1 + 4 + 1 + 3 + 5 letters in englishString = 14.
@Test
|
||||
void englishCharacterTest() {
|
||||
assertEquals(14,
|
||||
BasePostServiceImpl.htmlFormatCharacterCount(MarkdownUtils.renderHtml(englishString)));
|
||||
}
|
||||
|
||||
// Same as the English case (14 non-whitespace characters) plus the two trailing Chinese
// characters in hybridString, giving 16.
@Test
|
||||
void hybridCharacterTest() {
|
||||
assertEquals(16,
|
||||
BasePostServiceImpl.htmlFormatCharacterCount(MarkdownUtils.renderHtml(hybridString)));
|
||||
}
|
||||
|
||||
// Mixed-language and punctuation-heavy fixtures; every one of them is expected to yield 14 words.
@Test
|
||||
void moreComplexTest() {
|
||||
// complexText2: 4 Latin words (Hi, Jessica, project, schedule) + 10 characters outside
// Latin-1 (7 Chinese characters and 3 full-width punctuation marks) = 14.
assertEquals(14,
|
||||
BasePostServiceImpl.htmlFormatWordCount(MarkdownUtils.renderHtml(complexText2)));
|
||||
// complexText3: 12 English words + 2 full-width punctuation marks, each counted as one = 14.
assertEquals(14,
|
||||
BasePostServiceImpl.htmlFormatWordCount(MarkdownUtils.renderHtml(complexText3)));
|
||||
// complexText4: Chinese only; 12 Chinese characters + 2 full-width punctuation marks = 14.
assertEquals(14,
|
||||
BasePostServiceImpl.htmlFormatWordCount(MarkdownUtils.renderHtml(complexText4)));
|
||||
// complexText5: 14 English words; the ASCII comma and period only act as delimiters.
assertEquals(14,
|
||||
BasePostServiceImpl.htmlFormatWordCount(MarkdownUtils.renderHtml(complexText5)));
|
||||
}
|
||||
|
||||
|
||||
}
|
Loading…
Reference in New Issue