feat: Optimized word count for non-Chinese articles (#1865)

* feat: word count optimization

Now it can accurately identify the number of words in English articles and mixed-language articles with character fields.

* checkstyle

* delete field charCount

* fix typo and add some complex unit test

* refine unit test

* uniform word count

* fix style
pull/2076/head
yhc 2022-05-06 02:30:15 -05:00 committed by GitHub
parent 15d2f8e56c
commit 508c41b0fc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 95 additions and 5 deletions

View File

@ -143,7 +143,7 @@ public class BasePost extends BaseEntity {
private Integer topPriority;
/**
* Likes
* Likes.
*/
@Column(name = "likes")
@ColumnDefault("0")
@ -169,7 +169,7 @@ public class BasePost extends BaseEntity {
private String metaDescription;
/**
* Content word count
* Content word count.
*/
@Column(name = "word_count")
@ColumnDefault("0")
@ -188,6 +188,7 @@ public class BasePost extends BaseEntity {
@Transient
private PatchedContent content;
@Override
public void prePersist() {
super.prePersist();
@ -243,6 +244,7 @@ public class BasePost extends BaseEntity {
if (version == null || version < 0) {
version = 1;
}
// Clear the value of the deprecated attributes
this.originalContent = "";
this.formatContent = "";

View File

@ -64,6 +64,10 @@ public abstract class BasePostServiceImpl<POST extends BasePost>
/**
 * Matches a single whitespace character; used by htmlFormatCharacterCount.
 */
private static final Pattern BLANK_PATTERN = Pattern.compile("\\s");
/**
 * Despite the name, matches ANY single character outside the 0x00-0xff range
 * (so full-width punctuation such as "。" matches too, not only Chinese text).
 * htmlFormatWordCount counts every match as one word.
 */
private static final String CHINESE_REGEX = "[^\\x00-\\xff]";
/**
 * One or more punctuation (\p{P}), symbol (\p{S}), separator (\p{Z}) or
 * whitespace characters; htmlFormatWordCount splits the remaining text on it.
 */
private static final String PUNCTUATION_REGEX = "[\\p{P}\\p{S}\\p{Z}\\s]+";
public BasePostServiceImpl(BasePostRepository<POST> basePostRepository,
OptionService optionService,
ContentService contentService,
@ -301,7 +305,6 @@ public abstract class BasePostServiceImpl<POST extends BasePost>
PatchedContent postContent = post.getContent();
// word count stat
post.setWordCount(htmlFormatWordCount(postContent.getContent()));
POST savedPost;
// Create or update post
if (ServiceUtils.isEmptyId(post.getId())) {
@ -484,7 +487,7 @@ public abstract class BasePostServiceImpl<POST extends BasePost>
}
}
// CS304 issue link : https://github.com/halo-dev/halo/issues/1224
// CS304 issue link : https://github.com/halo-dev/halo/issues/1759
/**
* @param htmlContent the markdown style content
@ -498,6 +501,39 @@ public abstract class BasePostServiceImpl<POST extends BasePost>
String cleanContent = HaloUtils.cleanHtmlTag(htmlContent);
String tempString = cleanContent.replaceAll(CHINESE_REGEX, "");
String otherString = cleanContent.replaceAll(CHINESE_REGEX, " ");
int chineseWordCount = cleanContent.length() - tempString.length();
String[] otherWords = otherString.split(PUNCTUATION_REGEX);
int otherWordLength = otherWords.length;
if (otherWordLength > 0 && otherWords[0].length() == 0) {
otherWordLength--;
}
if (otherWords.length > 1 && otherWords[otherWords.length - 1].length() == 0) {
otherWordLength--;
}
return chineseWordCount + otherWordLength;
}
/**
* @param htmlContent the markdown style content
* @return character count except space and line separator
*/
public static long htmlFormatCharacterCount(String htmlContent) {
if (htmlContent == null) {
return 0;
}
String cleanContent = HaloUtils.cleanHtmlTag(htmlContent);
Matcher matcher = BLANK_PATTERN.matcher(cleanContent);
int count = 0;

View File

@ -59,6 +59,20 @@ public class HTMLWordCountTest {
String emptyString = "";
String englishString = "I have a red apple";
String hybridString = "I have a red apple哈哈";
// Fixtures for moreComplexTest: each must count as exactly 14 words, where every
// character outside 0x00-0xff is one word and the rest is split on punctuation/blanks.
// 4 Latin words (Hi, Jessica, project, schedule) + 7 CJK characters
// + 3 full-width marks (",", "!", "。") = 14.
String complexText2 = "Hi,Jessica!这个project的schedule有些问题。";
// 12 English words + 2 full-width marks ("。", "?") = 14.
String complexText3 = "The company had a meeting yesterday。Why did you ask for leave?";
// 12 CJK characters + 2 full-width marks (",", "。") = 14.
String complexText4 = "这是一个句子,但是只有中文。";
// 14 English words; the ASCII comma and full stop only act as separators.
String complexText5 =
    "The wind and the moon are all beautiful, love and hate are all romantic.";
@Test
void pictureTest() {
assertEquals("图片字数测试".length(),
@ -128,4 +142,42 @@ public class HTMLWordCountTest {
assertEquals(0,
BasePostServiceImpl.htmlFormatWordCount(MarkdownUtils.renderHtml(emptyString)));
}
}
/**
 * The English-only sample must be counted as 5 words.
 */
@Test
void englishTest() {
    String renderedHtml = MarkdownUtils.renderHtml(englishString);
    assertEquals(5, BasePostServiceImpl.htmlFormatWordCount(renderedHtml));
}
/**
 * The mixed sample (five English words followed by two CJK characters)
 * must be counted as 7 words.
 */
@Test
void hybridTest() {
    String renderedHtml = MarkdownUtils.renderHtml(hybridString);
    assertEquals(7, BasePostServiceImpl.htmlFormatWordCount(renderedHtml));
}
/**
 * The English-only sample must report 14 characters once blanks are excluded.
 */
@Test
void englishCharacterTest() {
    String renderedHtml = MarkdownUtils.renderHtml(englishString);
    assertEquals(14, BasePostServiceImpl.htmlFormatCharacterCount(renderedHtml));
}
/**
 * The mixed sample must report 16 characters once blanks are excluded.
 */
@Test
void hybridCharacterTest() {
    String renderedHtml = MarkdownUtils.renderHtml(hybridString);
    assertEquals(16, BasePostServiceImpl.htmlFormatCharacterCount(renderedHtml));
}
/**
 * Every complex sample is expected to be counted as 14 words.
 * Samples are checked in declaration order, stopping at the first failure.
 */
@Test
void moreComplexTest() {
    String[] samples = {complexText2, complexText3, complexText4, complexText5};
    for (String sample : samples) {
        String renderedHtml = MarkdownUtils.renderHtml(sample);
        assertEquals(14, BasePostServiceImpl.htmlFormatWordCount(renderedHtml));
    }
}
}