fixup func isBinary to handle CJK runes correctly

pull/721/head
荒野無燈 2019-05-08 02:02:12 +08:00
parent 805ad33c1b
commit 1c42539522
2 changed files with 39 additions and 5 deletions

View File

@ -164,7 +164,7 @@ func (i *FileInfo) detectType(modify, saveContent bool) error {
case strings.HasPrefix(mimetype, "image"): case strings.HasPrefix(mimetype, "image"):
i.Type = "image" i.Type = "image"
return nil return nil
case isBinary(string(buffer[:n])) || i.Size > 10*1024*1024: // 10 MB case isBinary(buffer[:n], n) || i.Size > 10*1024*1024: // 10 MB
i.Type = "blob" i.Type = "blob"
return nil return nil
default: default:

View File

@ -1,12 +1,46 @@
package files package files
func isBinary(content string) bool { import (
for _, b := range content { "unicode/utf8"
// 65533 is the unknown char )
// isBinary reports whether content looks like binary data rather than text.
//
// content is expected to be a prefix of a file (for example the first bytes
// read into a sniffing buffer), so the final multi-byte UTF-8 sequence may be
// cut off by the end of the buffer. Such a truncated trailing sequence is not
// treated as evidence of binary data; this is what keeps CJK and other
// multi-byte text from being misclassified.
//
// content is classified as binary when it contains:
//   - a byte in the range 0x00-0x08 (control characters such as NUL or
//     backspace), or
//   - a byte sequence that is not valid UTF-8 and is not merely an incomplete
//     rune at the very end of content.
//
// A validly encoded U+FFFD (the Unicode replacement character) is ordinary
// text and does not make content binary.
//
// n is accepted for compatibility with existing callers, which pass the
// number of bytes read; it is not used because len(content) already carries
// that information.
func isBinary(content []byte, n int) bool {
	for i := 0; i < len(content); {
		c := content[i]

		// Fast path for single-byte (ASCII) runes.
		if c < utf8.RuneSelf {
			// 8 and below are control chars (e.g. backspace, null, eof, etc).
			if c <= 8 {
				return true
			}
			i++
			continue
		}

		r, size := utf8.DecodeRune(content[i:])
		if r == utf8.RuneError && size == 1 {
			// The bytes at i do not decode. utf8.FullRune returns false only
			// when the remaining bytes are a valid but incomplete prefix of a
			// rune, i.e. the buffer ended in the middle of a character. In
			// that case nothing follows the prefix, so the content is text.
			// Any other decoding failure is a real encoding error.
			return utf8.FullRune(content[i:])
		}
		i += size
	}
	return false
}