// isBinary reports whether content looks like binary data rather than
// UTF-8 encoded text.
//
// content is expected to be a leading chunk of a file, so it may end in the
// middle of a multi-byte character (for example a CJK rune cut in half by the
// read buffer). n is the length the caller passes alongside content; it is
// not needed for the check and is kept only so existing call sites keep
// compiling.
//
// Content is reported as binary when it contains either
//   - a control character in the range U+0000..U+0008 (null, backspace, ...), or
//   - bytes that are not valid UTF-8, unless they form a valid but incomplete
//     multi-byte prefix at the very end of content.
//
// A correctly encoded U+FFFD (the Unicode replacement character) is ordinary
// text and does not make the content binary; only undecodable bytes do.
func isBinary(content []byte, n int) bool {
	for len(content) > 0 {
		r, size := utf8.DecodeRune(content)

		// DecodeRune signals an undecodable byte with (RuneError, 1); a real
		// U+FFFD in the input decodes with size 3 and is handled as text below.
		if r == utf8.RuneError && size == 1 {
			// FullRune is false only when the remaining bytes are a valid
			// prefix of a longer encoding, i.e. the chunk was truncated
			// mid-character. Anything else is genuinely invalid UTF-8.
			return utf8.FullRune(content)
		}

		// 8 and below are control chars (e.g. backspace, null, eof, etc).
		if r <= 8 {
			return true
		}

		content = content[size:]
	}

	return false
}