feat: Optimized word count for non-Chinese articles (#1865)

* feat: word count optimization

It can now accurately count the number of words in English articles and mixed-language articles, using character fields.

* checkstyle

* delete field charCount

* fix typo and add some complex unit test

* refine unit test

* uniform word count

* fix style
pull/2076/head
yhc 2022-05-06 02:30:15 -05:00 committed by GitHub
parent 15d2f8e56c
commit 508c41b0fc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 95 additions and 5 deletions

View File

@ -143,7 +143,7 @@ public class BasePost extends BaseEntity {
private Integer topPriority; private Integer topPriority;
/** /**
* Likes * Likes.
*/ */
@Column(name = "likes") @Column(name = "likes")
@ColumnDefault("0") @ColumnDefault("0")
@ -169,7 +169,7 @@ public class BasePost extends BaseEntity {
private String metaDescription; private String metaDescription;
/** /**
* Content word count * Content word count.
*/ */
@Column(name = "word_count") @Column(name = "word_count")
@ColumnDefault("0") @ColumnDefault("0")
@ -188,6 +188,7 @@ public class BasePost extends BaseEntity {
@Transient @Transient
private PatchedContent content; private PatchedContent content;
@Override @Override
public void prePersist() { public void prePersist() {
super.prePersist(); super.prePersist();
@ -243,6 +244,7 @@ public class BasePost extends BaseEntity {
if (version == null || version < 0) { if (version == null || version < 0) {
version = 1; version = 1;
} }
// Clear the value of the deprecated attributes // Clear the value of the deprecated attributes
this.originalContent = ""; this.originalContent = "";
this.formatContent = ""; this.formatContent = "";

View File

@ -64,6 +64,10 @@ public abstract class BasePostServiceImpl<POST extends BasePost>
private static final Pattern BLANK_PATTERN = Pattern.compile("\\s"); private static final Pattern BLANK_PATTERN = Pattern.compile("\\s");
private static final String CHINESE_REGEX = "[^\\x00-\\xff]";
private static final String PUNCTUATION_REGEX = "[\\p{P}\\p{S}\\p{Z}\\s]+";
public BasePostServiceImpl(BasePostRepository<POST> basePostRepository, public BasePostServiceImpl(BasePostRepository<POST> basePostRepository,
OptionService optionService, OptionService optionService,
ContentService contentService, ContentService contentService,
@ -301,7 +305,6 @@ public abstract class BasePostServiceImpl<POST extends BasePost>
PatchedContent postContent = post.getContent(); PatchedContent postContent = post.getContent();
// word count stat // word count stat
post.setWordCount(htmlFormatWordCount(postContent.getContent())); post.setWordCount(htmlFormatWordCount(postContent.getContent()));
POST savedPost; POST savedPost;
// Create or update post // Create or update post
if (ServiceUtils.isEmptyId(post.getId())) { if (ServiceUtils.isEmptyId(post.getId())) {
@ -484,7 +487,7 @@ public abstract class BasePostServiceImpl<POST extends BasePost>
} }
} }
// CS304 issue link : https://github.com/halo-dev/halo/issues/1224 // CS304 issue link : https://github.com/halo-dev/halo/issues/1759
/** /**
* @param htmlContent the markdown style content * @param htmlContent the markdown style content
@ -498,6 +501,39 @@ public abstract class BasePostServiceImpl<POST extends BasePost>
String cleanContent = HaloUtils.cleanHtmlTag(htmlContent); String cleanContent = HaloUtils.cleanHtmlTag(htmlContent);
String tempString = cleanContent.replaceAll(CHINESE_REGEX, "");
String otherString = cleanContent.replaceAll(CHINESE_REGEX, " ");
int chineseWordCount = cleanContent.length() - tempString.length();
String[] otherWords = otherString.split(PUNCTUATION_REGEX);
int otherWordLength = otherWords.length;
if (otherWordLength > 0 && otherWords[0].length() == 0) {
otherWordLength--;
}
if (otherWords.length > 1 && otherWords[otherWords.length - 1].length() == 0) {
otherWordLength--;
}
return chineseWordCount + otherWordLength;
}
/**
* @param htmlContent the markdown style content
* @return character count except space and line separator
*/
public static long htmlFormatCharacterCount(String htmlContent) {
if (htmlContent == null) {
return 0;
}
String cleanContent = HaloUtils.cleanHtmlTag(htmlContent);
Matcher matcher = BLANK_PATTERN.matcher(cleanContent); Matcher matcher = BLANK_PATTERN.matcher(cleanContent);
int count = 0; int count = 0;

View File

@ -59,6 +59,20 @@ public class HTMLWordCountTest {
String emptyString = ""; String emptyString = "";
// Fixtures for the word and character count tests.
// BasePostServiceImpl.htmlFormatWordCount counts every character outside Latin-1
// (CJK ideographs and full-width punctuation) as one word and splits the remaining
// text into words on punctuation, symbols and whitespace.

// Five space-separated English words, 14 non-blank characters.
String englishString = "I have a red apple";

// The same five English words followed by two Chinese characters.
String hybridString = "I have a red apple哈哈";

// 4 Latin words (Hi, Jessica, project, schedule) + 10 non-Latin-1 characters = 14.
// The full-width comma (U+FF0C) and exclamation mark (U+FF01) are written as escapes
// so that text normalisation cannot silently turn them into ASCII or drop them,
// which would change the expected count.
String complexText2 = "Hi\uFF0CJessica\uFF01这个project的schedule有些问题。";

// 12 Latin words + ideographic full stop + full-width question mark (U+FF1F) = 14.
String complexText3 =
    "The company had a meeting yesterday。Why did you ask for leave\uFF1F";

// 12 ideographs + full-width comma (U+FF0C) + ideographic full stop = 14.
String complexText4 = "这是一个句子\uFF0C但是只有中文。";

// 14 Latin words; the ASCII comma and full stop only act as separators.
String complexText5 =
    "The wind and the moon are all beautiful, love and hate are all romantic.";
@Test @Test
void pictureTest() { void pictureTest() {
assertEquals("图片字数测试".length(), assertEquals("图片字数测试".length(),
@ -128,4 +142,42 @@ public class HTMLWordCountTest {
assertEquals(0, assertEquals(0,
BasePostServiceImpl.htmlFormatWordCount(MarkdownUtils.renderHtml(emptyString))); BasePostServiceImpl.htmlFormatWordCount(MarkdownUtils.renderHtml(emptyString)));
} }
}
/**
 * The rendered English fixture must be reported as five words.
 */
@Test
void englishTest() {
    String renderedHtml = MarkdownUtils.renderHtml(englishString);
    assertEquals(5, BasePostServiceImpl.htmlFormatWordCount(renderedHtml));
}
/**
 * The rendered English/Chinese fixture must be reported as seven words.
 */
@Test
void hybridTest() {
    String renderedHtml = MarkdownUtils.renderHtml(hybridString);
    assertEquals(7, BasePostServiceImpl.htmlFormatWordCount(renderedHtml));
}
/**
 * The rendered English fixture must be reported as fourteen characters.
 */
@Test
void englishCharacterTest() {
    String renderedHtml = MarkdownUtils.renderHtml(englishString);
    long characters = BasePostServiceImpl.htmlFormatCharacterCount(renderedHtml);
    assertEquals(14, characters);
}
/**
 * The rendered English/Chinese fixture must be reported as sixteen characters.
 */
@Test
void hybridCharacterTest() {
    String renderedHtml = MarkdownUtils.renderHtml(hybridString);
    long characters = BasePostServiceImpl.htmlFormatCharacterCount(renderedHtml);
    assertEquals(16, characters);
}
/**
 * Each of the complex fixtures, checked in declaration order, must be reported as
 * fourteen words once rendered to HTML.
 */
@Test
void moreComplexTest() {
    String[] samples = {complexText2, complexText3, complexText4, complexText5};
    for (String sample : samples) {
        String renderedHtml = MarkdownUtils.renderHtml(sample);
        assertEquals(14, BasePostServiceImpl.htmlFormatWordCount(renderedHtml));
    }
}
}