diff --git a/jodconverter-web/src/main/java/com/yudianbank/utils/DownloadUtils.java b/jodconverter-web/src/main/java/com/yudianbank/utils/DownloadUtils.java
index 1be2f8f0..33b70c4e 100644
--- a/jodconverter-web/src/main/java/com/yudianbank/utils/DownloadUtils.java
+++ b/jodconverter-web/src/main/java/com/yudianbank/utils/DownloadUtils.java
@@ -5,6 +5,7 @@ import org.springframework.beans.factory.annotation.Value;
 import org.springframework.stereotype.Component;
 import java.io.*;
 import java.net.*;
+import java.nio.charset.Charset;
 import java.util.UUID;
 
 /**
@@ -71,6 +72,10 @@ public class DownloadUtils {
             response.setContent(realPath);
             // 同样针对类txt文件,如果成功msg包含的是转换后的文件名
             response.setMsg(fileName);
+
+            // Re-encode the downloaded text file as UTF-8.
+            convertTextPlainFileCharsetToUtf8(realPath);
+
             return response;
         } catch (IOException e) {
             e.printStackTrace();
@@ -152,4 +157,80 @@ public class DownloadUtils {
         }
         return newType;
     }
+
+    /**
+     * Re-encodes a plain-text file as UTF-8 in place.
+     * <p>
+     * The source charset is guessed with {@link FileCharsetDetector}. The file is left
+     * untouched when it is missing or unreadable, when no charset was detected, when the
+     * detected name is not a charset this JVM supports, or when the content is already
+     * UTF-8 or plain ASCII (ASCII bytes are valid UTF-8 as they are).
+     *
+     * @param filePath path of the file to convert
+     * @throws IOException if reading, re-encoding or replacing the original file fails
+     */
+    private static void convertTextPlainFileCharsetToUtf8(String filePath) throws IOException {
+        File sourceFile = new File(filePath);
+        if (!sourceFile.isFile() || !sourceFile.canRead()) {
+            return;
+        }
+        String encoding = null;
+        try {
+            encoding = FileCharsetDetector.guessFileEncoding(sourceFile).getEncoding();
+        } catch (IOException e) {
+            // Detection is best effort: on failure the file is kept exactly as downloaded.
+            e.printStackTrace();
+        }
+        if (!needsUtf8Conversion(encoding)) {
+            return;
+        }
+        File tmpUtf8File = new File(filePath + ".utf8");
+        boolean replaced = false;
+        try {
+            // try-with-resources closes both streams even when the copy fails half-way.
+            try (Reader reader = new BufferedReader(
+                         new InputStreamReader(new FileInputStream(sourceFile), encoding));
+                 Writer writer = new BufferedWriter(
+                         new OutputStreamWriter(new FileOutputStream(tmpUtf8File), "UTF-8"))) {
+                char[] buf = new char[1024];
+                int read;
+                while ((read = reader.read(buf)) != -1) {
+                    writer.write(buf, 0, read);
+                }
+            }
+            // Swap the converted copy in for the original. Files.move reports failure with
+            // an exception, so a failed replacement cannot go unnoticed.
+            java.nio.file.Files.move(tmpUtf8File.toPath(), sourceFile.toPath(),
+                    java.nio.file.StandardCopyOption.REPLACE_EXISTING);
+            replaced = true;
+        } finally {
+            if (!replaced) {
+                // Do not leave a partial temporary file next to the original.
+                tmpUtf8File.delete();
+            }
+        }
+    }
+
+    /**
+     * Tells whether a detected charset name calls for a conversion to UTF-8.
+     *
+     * @param encoding charset name reported by the detector, may be {@code null}
+     * @return {@code false} for {@code null}, for names this JVM cannot decode, and for
+     *         UTF-8 / US-ASCII (including their aliases); {@code true} otherwise
+     */
+    private static boolean needsUtf8Conversion(String encoding) {
+        if (encoding == null) {
+            return false;
+        }
+        Charset source;
+        try {
+            // Resolves aliases such as "utf8" or "ASCII" to their canonical names.
+            source = Charset.forName(encoding);
+        } catch (IllegalArgumentException e) {
+            // Illegal or unsupported name: there is no decoder to convert with.
+            return false;
+        }
+        String canonical = source.name();
+        return !"UTF-8".equals(canonical) && !"US-ASCII".equals(canonical);
+    }
 }
diff --git 
a/jodconverter-web/src/main/java/com/yudianbank/utils/FileCharsetDetector.java b/jodconverter-web/src/main/java/com/yudianbank/utils/FileCharsetDetector.java
new file mode 100644
index 00000000..b08f758d
--- /dev/null
+++ b/jodconverter-web/src/main/java/com/yudianbank/utils/FileCharsetDetector.java
@@ -0,0 +1,144 @@
+package com.yudianbank.utils;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+import org.mozilla.intl.chardet.nsDetector;
+import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
+
+/**
+ * Utility class that guesses the character encoding of a text file.
+ *
+ * @author HWliao
+ * @date 2017-12-24
+ */
+public class FileCharsetDetector {
+
+    /** Placeholder jchardet reports instead of a charset name when it has no candidate. */
+    private static final String NO_MATCH = "nomatch";
+
+    /**
+     * Guesses the encoding of a file using a detector created without a language hint.
+     *
+     * @param file file to inspect
+     * @return the detection result; its encoding is {@code null} when nothing was determined
+     * @throws FileNotFoundException if the file cannot be opened for reading
+     * @throws IOException           if reading the file fails
+     */
+    public static Observer guessFileEncoding(File file)
+            throws FileNotFoundException, IOException {
+        return guessFileEncoding(file, new nsDetector());
+    }
+
+    /**
+     * Guesses the encoding of a file using a detector created with a language hint.
+     *
+     * <pre>
+     * Language hint codes (see nsPSMDetector):
+     *   1 : Japanese
+     *   2 : Chinese
+     *   3 : Simplified Chinese
+     *   4 : Traditional Chinese
+     *   5 : Korean
+     *   6 : Don't know (default)
+     * </pre>
+     *
+     * @param file         file to inspect
+     * @param languageHint language hint code from the table above
+     * @return the detection result; its encoding is a name such as UTF-8, GBK or GB2312,
+     *         or {@code null} when nothing was determined
+     * @throws FileNotFoundException if the file cannot be opened for reading
+     * @throws IOException           if reading the file fails
+     */
+    public static Observer guessFileEncoding(File file, int languageHint)
+            throws FileNotFoundException, IOException {
+        return guessFileEncoding(file, new nsDetector(languageHint));
+    }
+
+    /**
+     * Feeds the file to the given detector and collects the result.
+     * <p>
+     * A file counts as ASCII only when every chunk read is pure ASCII (an empty file
+     * therefore counts as ASCII). Chunks are handed to the detector from the first
+     * non-ASCII chunk on, until it reports that it is done or the file ends. When the
+     * detector never notifies the observer, its first probable charset is used.
+     *
+     * @param file file to inspect
+     * @param det  detector to run
+     * @return the detection result; its encoding is {@code null} when nothing was determined
+     * @throws FileNotFoundException if the file cannot be opened for reading
+     * @throws IOException           if reading the file fails
+     */
+    private static Observer guessFileEncoding(File file, nsDetector det)
+            throws FileNotFoundException, IOException {
+        Observer observer = new Observer();
+        // Register the observer with the detector; Notify() records the charset it is given.
+        det.Init(observer);
+
+        // True only while every byte read so far is plain ASCII.
+        boolean isAscii = true;
+        boolean done = false;
+        // try-with-resources closes the stream even when a read fails.
+        try (BufferedInputStream imp = new BufferedInputStream(new FileInputStream(file))) {
+            byte[] buf = new byte[1024];
+            int len;
+            while (!done && (len = imp.read(buf, 0, buf.length)) != -1) {
+                if (isAscii) {
+                    isAscii = det.isAscii(buf, len);
+                }
+                if (!isAscii) {
+                    done = det.DoIt(buf, len, false);
+                }
+            }
+        }
+        det.DataEnd();
+
+        if (isAscii) {
+            observer.encoding = "ASCII";
+            observer.found = true;
+        } else if (!observer.isFound()) {
+            // No confirmed charset: fall back to the first probable one, if any.
+            String[] prob = det.getProbableCharsets();
+            boolean hasCandidate = prob.length > 0 && !NO_MATCH.equals(prob[0]);
+            observer.encoding = hasCandidate ? prob[0] : null;
+        }
+        return observer;
+    }
+
+    /**
+     * Detection observer: records the charset the detector passes to {@link #Notify}.
+     *
+     * @author liaohongwei
+     * @date 2016-06-20 14:27:06
+     */
+    public static class Observer implements nsICharsetDetectionObserver {
+
+        /** Detected or most probable charset name, or {@code null} when none is known. */
+        private String encoding = null;
+        /** True when the charset was notified by the detector or the content is pure ASCII. */
+        private boolean found = false;
+
+        @Override
+        public void Notify(String charset) {
+            this.encoding = charset;
+            this.found = true;
+        }
+
+        public String getEncoding() {
+            return encoding;
+        }
+
+        public boolean isFound() {
+            return found;
+        }
+
+        @Override
+        public String toString() {
+            return "Observer [encoding=" + encoding + ", found=" + found + "]";
+        }
+    }
+
+}