文章目录
- 构造让探测器失效的文件
- 文件编码探测原理
- 探测器Java实现版本
- 测试
构造让探测器失效的文件
我们用vscode打开一个文本文件的时候,默认会使用UTF-8编码,所以当文件不是UTF-8编码的时候就会乱码。
但是,notepad-- 这类编辑器似乎总是能以正确的编码打开文本文件。
为什么呢?
notepad-- 这类编辑器真的总能以正确的编码打开文件、不出现乱码吗?
答案是否定的。不信的话,可以用下面的代码生成一个文件,试试 notepad-- 这类编辑器能否正确打开。
/**
 * Generates the GB2312-encoded sample file used by the article: the same
 * three-character phrase written 100000 times, with a line break after
 * every 100 repetitions.
 *
 * @throws IOException if the file cannot be created or written
 */
@Test
public void write() throws IOException {
    Path path = Paths.get("F:\\tmp\\gb2312.txt");
    // try-with-resources flushes and closes the writer even when a write fails,
    // so the file handle is never leaked.
    try (BufferedWriter bw = Files.newBufferedWriter(path, Charset.forName("GB2312"))) {
        for (int i = 1; i <= 100000; i++) {
            bw.write("瑜多爱");
            if (i % 100 == 0) {
                bw.newLine();
            }
        }
    }
}
见证奇迹的时候到了:
为什么会出现这种情况呢?
文件编码探测原理
其实,现在能找到的文件编码探测器,基本都是通过Mozilla4开源的探测器修改而来。
基本原理就是统计要检测文件中的所有字节落在不同编码区间的值的概率。
然后,选出所有可能编码中概率最大的作为文件编码。
探测器Java实现版本
Java版本很多都需要引入新的jar包,我这里找了一个不知道经过几手翻译的代码,改成了Java代码,并做了一点优化。
如果要求不高,可以尝试使用。
代码比较长,下面列不完,可以在https://download.csdn.net/download/trayvontang/89005882下载。
import lombok.Getter;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
public class EncodingDetectHelper {
// Character-pair frequency tables, indexed as [row][column] by the *Probability methods of this class.
// Java zero-fills new arrays; initializeFrequencies() assigns the non-zero weights.
// Looked up with (lead - 0xA1, trail - 0xA1) by the GB2312/GBK/GB18030 scorers.
private static final int[][] GBFreq = new int[94][94];
// Looked up with (lead - 0x81, trail - 0x40) for the GBK extension area.
private static final int[][] GBKFreq = new int[126][191];
// Looked up with (lead - 0xA1, trail - 0x40 or trail - 0x61) by big5Probability.
private static final int[][] Big5Freq = new int[94][158];
// Looked up with (lead - 0xA1, trail - 0xA1) by eucTwProbability.
private static final int[][] EUC_TWFreq = new int[94][94];
// Looked up with (lead - 0xA1, trail - 0xA1) by the Korean scorers.
private static final int[][] KRFreq = new int[94][94];
// Looked up with (lead - 0xA1, trail - 0xA1) by eucJpProbability.
private static final int[][] JPFreq = new int[94][94];
static {
// Populate the tables once, when the class is initialized.
initializeFrequencies();
}
/**
 * Detects the encoding of a file using the default settings: at most the
 * first 300 MiB are examined, and the candidates are UTF-8, GB2312, GBK,
 * GB18030, UTF-16 and ASCII.
 *
 * <p>Delegates to {@code detectEncoding(File, Integer, Set)}.
 *
 * @param contentFile file to inspect
 * @return the detected encoding, as reported by the three-argument overload
 * @throws IOException if the file cannot be read
 */
public static DetectEncoding detectEncoding(File contentFile) throws IOException {
    // Default probe size: 300 MiB.
    final int maxProbeBytes = 300 * 1024 * 1024;
    return detectEncoding(contentFile, maxProbeBytes, Set.of(
            DetectEncoding.UTF8,
            DetectEncoding.GB2312,
            DetectEncoding.GBK,
            DetectEncoding.GB18030,
            DetectEncoding.UTF_16,
            DetectEncoding.ASCII));
}
/**
 * Guesses the encoding of a file from its leading bytes.
 *
 * <p>The first four bytes are checked for a byte-order mark; when one is
 * found the matching encoding is returned immediately. Otherwise up to
 * {@code detectLength} bytes are read and scored against every encoding in
 * {@code checkEncodingSet}.
 *
 * @param contentFile      file to inspect
 * @param detectLength     maximum number of bytes to examine; {@code null},
 *                         zero or a negative value means "the whole file".
 *                         Values below 4 are raised to 4 because the BOM
 *                         check needs four bytes.
 * @param checkEncodingSet encodings to score
 * @return the most likely encoding, or {@code null} when the file is shorter
 *         than four bytes
 * @throws IOException      if the file cannot be read
 * @throws RuntimeException if fewer bytes than expected could be read
 */
public static DetectEncoding detectEncoding(File contentFile, Integer detectLength, Set<DetectEncoding> checkEncodingSet) throws IOException {
    final int bomLength = 4;
    // Arrays close to Integer.MAX_VALUE exceed the VM limit on common JVMs.
    final int maxArrayLength = Integer.MAX_VALUE - 8;

    long fileLength = contentFile.length();
    if (fileLength < bomLength) {
        // Kept for backward compatibility: tiny files are reported as unknown.
        return null;
    }

    long requested = (detectLength == null || detectLength <= 0) ? fileLength : detectLength;
    long capped = Math.min(Math.min(requested, fileLength), maxArrayLength);
    int probeLength = (int) Math.max(bomLength, capped);

    try (FileInputStream fis = new FileInputStream(contentFile)) {
        // readNBytes keeps reading until the buffer is full or EOF is reached;
        // a plain read() may legally return fewer bytes.
        byte[] head = new byte[bomLength];
        if (fis.readNBytes(head, 0, bomLength) < bomLength) {
            throw new RuntimeException("未读取到文件数据-" + contentFile.getAbsolutePath());
        }
        int b0 = head[0] & 0xFF;
        int b1 = head[1] & 0xFF;
        int b2 = head[2] & 0xFF;
        int b3 = head[3] & 0xFF;

        // BOM check first for a fast answer. The UTF-32 marks must be tested
        // before the UTF-16 ones because FF FE is a prefix of FF FE 00 00.
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { // EF BB BF
            return DetectEncoding.UTF8;
        }
        if (b0 == 0xFF && b1 == 0xFE && b2 == 0x00 && b3 == 0x00) { // FF FE 00 00
            return DetectEncoding.UTF_32LE;
        }
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0xFE && b3 == 0xFF) { // 00 00 FE FF
            return DetectEncoding.UTF_32BE;
        }
        if (b0 == 0xFE && b1 == 0xFF) { // FE FF
            return DetectEncoding.UTF_16BE;
        }
        if (b0 == 0xFF && b1 == 0xFE) { // FF FE
            return DetectEncoding.UTF_16LE;
        }

        // Only allocate the (potentially large) probe buffer when no BOM was found.
        byte[] contentByte = new byte[probeLength];
        System.arraycopy(head, 0, contentByte, 0, bomLength);
        int tail = fis.readNBytes(contentByte, bomLength, probeLength - bomLength);
        if (probeLength > bomLength && tail == 0) {
            throw new RuntimeException("读取到文件数据异常-" + contentFile.getAbsolutePath());
        }
        int total = bomLength + tail;
        if (total < probeLength) {
            // The file shrank while being read: drop the unread zero padding so
            // it does not distort the scores.
            contentByte = java.util.Arrays.copyOf(contentByte, total);
        }
        return detectEncoding(contentByte, checkEncodingSet);
    }
}
/**
 * Scores the content against every requested encoding and returns the best match.
 *
 * <p>Candidates are evaluated in a fixed priority order and a later candidate
 * only wins with a strictly higher score, so equal scores always resolve the
 * same way (for example GBK before GB18030).
 *
 * @param contentByte      bytes to analyse
 * @param checkEncodingSet encodings to score; encodings without a scorer count as 0
 * @return the highest-scoring encoding, or {@code DetectEncoding.OTHER} when
 *         nothing was requested or the best score is below 50
 */
private static DetectEncoding detectEncoding(byte[] contentByte, Set<DetectEncoding> checkEncodingSet) {
    DetectEncoding[] candidates = {
        DetectEncoding.UTF8, DetectEncoding.GB2312, DetectEncoding.GBK, DetectEncoding.GB18030,
        DetectEncoding.HZ, DetectEncoding.BIG5, DetectEncoding.CNS11643, DetectEncoding.ISO2022CN,
        DetectEncoding.UNICODE, DetectEncoding.EUC_KR, DetectEncoding.CP949, DetectEncoding.JOHAB,
        DetectEncoding.ISO2022KR, DetectEncoding.ASCII, DetectEncoding.SJIS, DetectEncoding.EUC_JP,
        DetectEncoding.ISO2022JP, DetectEncoding.UNICODET, DetectEncoding.UNICODES,
        DetectEncoding.ISO2022CN_GB, DetectEncoding.ISO2022CN_CNS, DetectEncoding.OTHER
    };

    DetectEncoding best = DetectEncoding.OTHER;
    int bestScore = Integer.MIN_VALUE;
    for (DetectEncoding candidate : candidates) {
        if (!checkEncodingSet.contains(candidate)) {
            continue;
        }
        int score = scoreEncoding(candidate, contentByte);
        if (score > bestScore) {
            bestScore = score;
            best = candidate;
        }
    }
    // Return OTHER if nothing scored at least 50.
    return bestScore < 50 ? DetectEncoding.OTHER : best;
}

/** Returns the probability score of one encoding; encodings without a scorer yield 0. */
private static int scoreEncoding(DetectEncoding encoding, byte[] contentByte) {
    switch (encoding) {
        case UTF8:
            return utf8Probability(contentByte);
        case GB2312:
            return gb2312Probability(contentByte);
        case GBK:
            return gbkProbability(contentByte);
        case GB18030:
            return gb18030Probability(contentByte);
        case HZ:
            return hzProbability(contentByte);
        case BIG5:
            return big5Probability(contentByte);
        case CNS11643:
            return eucTwProbability(contentByte);
        case ISO2022CN:
            return iso2022CnProbability(contentByte);
        case UNICODE:
            return utf16Probability(contentByte);
        case EUC_KR:
            return eucKrProbability(contentByte);
        case CP949:
            return cp949Probability(contentByte);
        case ISO2022KR:
            return iso2022KrProbability(contentByte);
        case ASCII:
            return asciiProbability(contentByte);
        case SJIS:
            return sjisProbability(contentByte);
        case EUC_JP:
            return eucJpProbability(contentByte);
        case ISO2022JP:
            return iso2022JpProbability(contentByte);
        default:
            return 0;
    }
}
/**
 * Scores how well the bytes fit GB2312.
 *
 * <p>Half of the score is the share of high-byte units that form a pair in
 * the GB2312 range (lead A1..F7, trail A1..FE); the other half is the
 * average frequency weight of those pairs taken from {@code GBFreq}.
 *
 * @param contentByte bytes to analyse
 * @return the combined score
 */
private static int gb2312Probability(byte[] contentByte) {
    // Counters start at 1 so the ratios below never divide by zero.
    int highByteUnits = 1;
    int gbPairs = 1;
    long weightSum = 0;
    long weightCap = 1;

    final int lastLead = contentByte.length - 1;
    int pos = 0;
    while (pos < lastLead) {
        int lead = contentByte[pos] & 0xFF;
        if (lead < 0x80) {
            // ASCII byte: not part of a double-byte unit.
            pos++;
            continue;
        }
        highByteUnits++;
        int trail = contentByte[pos + 1] & 0xFF;
        boolean leadInRange = lead >= 0xA1 && lead <= 0xF7;
        boolean trailInRange = trail >= 0xA1 && trail <= 0xFE;
        if (leadInRange && trailInRange) {
            gbPairs++;
            weightCap += 500;
            int row = lead - 0xA1;
            int column = trail - 0xA1;
            int weight = GBFreq[row][column];
            if (weight != 0) {
                weightSum += weight;
            } else if (row >= 15 && row < 55) {
                // No table entry, but the row lies in the GB high-frequency range.
                weightSum += 200;
            }
        }
        // A high byte always consumes the following byte as well.
        pos += 2;
    }

    float rangeValue = 50 * ((float) gbPairs / (float) highByteUnits);
    float freqValue = 50 * ((float) weightSum / (float) weightCap);
    return (int) (rangeValue + freqValue);
}
/**
 * Scores how well the bytes fit GBK.
 *
 * <p>Pairs in the original GB2312 range are weighted with {@code GBFreq};
 * pairs in the extended range (lead 81..FE, trail 40..7E or 80..FE) are
 * weighted with {@code GBKFreq}. The result is reduced by one so that plain
 * GB2312 content scores higher under GB2312 than under GBK.
 *
 * @param rawtext bytes to analyse
 * @return the combined score minus one
 */
private static int gbkProbability(byte[] rawtext) {
    // Counters start at 1 so the ratios below never divide by zero.
    int highByteUnits = 1;
    int gbPairs = 1;
    long weightSum = 0;
    long weightCap = 1;

    final int lastLead = rawtext.length - 1;
    int pos = 0;
    while (pos < lastLead) {
        int lead = rawtext[pos] & 0xFF;
        if (lead < 0x80) {
            pos++;
            continue;
        }
        highByteUnits++;
        int trail = rawtext[pos + 1] & 0xFF;

        boolean gb2312Pair = lead >= 0xA1 && lead <= 0xF7 && trail >= 0xA1 && trail <= 0xFE;
        boolean extendedTrail = (trail >= 0x80 && trail <= 0xFE) || (trail >= 0x40 && trail <= 0x7E);
        if (gb2312Pair) {
            // Original GB range
            gbPairs++;
            weightCap += 500;
            int row = lead - 0xA1;
            int weight = GBFreq[row][trail - 0xA1];
            if (weight != 0) {
                weightSum += weight;
            } else if (row >= 15 && row < 55) {
                weightSum += 200;
            }
        } else if (lead >= 0x81 && lead <= 0xFE && extendedTrail) {
            // Extended GB range; a zero table entry simply adds nothing.
            gbPairs++;
            weightCap += 500;
            weightSum += GBKFreq[lead - 0x81][trail - 0x40];
        }
        pos += 2;
    }

    float rangeValue = 50 * ((float) gbPairs / (float) highByteUnits);
    float freqValue = 50 * ((float) weightSum / (float) weightCap);
    // Regular GB files would otherwise score the same here, hence the handicap.
    return (int) (rangeValue + freqValue) - 1;
}
/**
 * Scores how well the bytes fit GB18030.
 *
 * <p>Recognises three shapes after a high lead byte: a GB2312 pair (weighted
 * with {@code GBFreq}), a GBK extension pair (weighted with {@code GBKFreq})
 * and a four-byte sequence (lead 81..FE, digit 30..39, 81..FE, digit 30..39),
 * which counts as in-range but carries no frequency weight. The result is
 * reduced by one so that plain GB2312 content prefers GB2312.
 *
 * @param contentByte bytes to analyse
 * @return the combined score minus one
 */
private static int gb18030Probability(byte[] contentByte) {
    final int contentLength = contentByte.length;
    // Counters start at 1 so the ratios below never divide by zero.
    int dbchars = 1;
    int gbchars = 1;
    long gbfreq = 0;
    long totalfreq = 1;

    int pos = 0;
    while (pos < contentLength - 1) {
        int lead = contentByte[pos] & 0xFF;
        if (lead < 0x80) {
            pos++;
            continue;
        }
        dbchars++;
        int second = contentByte[pos + 1] & 0xFF;
        boolean extendedLead = lead >= 0x81 && lead <= 0xFE;
        // Unmatched high bytes still consume the byte that follows them.
        int advance = 2;

        if (lead >= 0xA1 && lead <= 0xF7 && second >= 0xA1 && second <= 0xFE) {
            // Original GB range
            gbchars++;
            totalfreq += 500;
            int row = lead - 0xA1;
            int weight = GBFreq[row][second - 0xA1];
            if (weight != 0) {
                gbfreq += weight;
            } else if (row >= 15 && row < 55) {
                gbfreq += 200;
            }
        } else if (extendedLead
                && ((second >= 0x80 && second <= 0xFE) || (second >= 0x40 && second <= 0x7E))) {
            // Extended GB (GBK) range
            gbchars++;
            totalfreq += 500;
            gbfreq += GBKFreq[lead - 0x81][second - 0x40];
        } else if (extendedLead && pos + 3 < contentLength && second >= 0x30 && second <= 0x39) {
            int third = contentByte[pos + 2] & 0xFF;
            int fourth = contentByte[pos + 3] & 0xFF;
            if (third >= 0x81 && third <= 0xFE && fourth >= 0x30 && fourth <= 0x39) {
                gbchars++;
                // Skip the whole four-byte character; advancing by only two bytes
                // would re-scan its second half as a new lead byte.
                advance = 4;
            }
        }
        pos += advance;
    }

    float rangeValue = 50 * ((float) gbchars / (float) dbchars);
    float freqValue = 50 * ((float) gbfreq / (float) totalfreq);
    // Regular GB files would otherwise score the same here, hence the handicap.
    return (int) (rangeValue + freqValue) - 1;
}
/**
 * Scores how well the bytes fit HZ.
 *
 * <p>Counts the number of "~{" shift-in sequences and, inside each shifted
 * segment, weights byte pairs in 21..77 with {@code GBFreq}. A segment ends
 * at "~}" or at a line break.
 *
 * @param contentByte bytes to analyse
 * @return a range score derived from the number of shift-ins (0, 39, 41 or
 *         50) plus up to 50 points of frequency score
 */
private static int hzProbability(byte[] contentByte) {
    final int contentLength = contentByte.length;
    long hzfreq = 0;
    long totalfreq = 1; // starts at 1 so the ratio below never divides by zero
    int hzstart = 0;

    // Stop one byte early: a '~' is only meaningful together with the byte
    // after it, and reading past the end would throw.
    for (int i = 0; i < contentLength - 1; i++) {
        if (contentByte[i] != '~') {
            continue;
        }
        byte next = contentByte[i + 1];
        if (next == '{') {
            hzstart++;
            i += 2;
            while (i < contentLength - 1) {
                byte first = contentByte[i];
                byte second = contentByte[i + 1];
                if (first == 0x0A || first == 0x0D) {
                    break;
                }
                if (first == '~' && second == '}') {
                    i++;
                    break;
                }
                if (0x21 <= first && first <= 0x77 && 0x21 <= second && second <= 0x77) {
                    int row = first - 0x21;
                    int column = second - 0x21;
                    totalfreq += 500;
                    if (GBFreq[row][column] != 0) {
                        hzfreq += GBFreq[row][column];
                    } else if (15 <= row && row < 55) {
                        hzfreq += 200;
                    }
                }
                i += 2;
            }
        } else if (next == '}' || next == '~') {
            // "~}" outside a segment and the "~~" escape: skip the second byte.
            i++;
        }
    }

    float rangeValue;
    if (hzstart > 4) {
        rangeValue = 50;
    } else if (hzstart > 1) {
        rangeValue = 41;
    } else if (hzstart > 0) {
        // Only 39 in case the sequence happened to occur in otherwise non-HZ text.
        rangeValue = 39;
    } else {
        rangeValue = 0;
    }
    float freqValue = 50 * ((float) hzfreq / (float) totalfreq);
    return (int) (rangeValue + freqValue);
}
/**
 * Scores how well the bytes fit Big5.
 *
 * <p>A pair is in range when the lead byte is A1..F9 and the trail byte is
 * 40..7E or A1..FE; in-range pairs are weighted with {@code Big5Freq}.
 *
 * @param contentByte bytes to analyse
 * @return the combined score
 */
private static int big5Probability(byte[] contentByte) {
    // Counters start at 1 so the ratios below never divide by zero.
    int highByteUnits = 1;
    int big5Pairs = 1;
    long weightSum = 0;
    long weightCap = 1;

    final int lastLead = contentByte.length - 1;
    int pos = 0;
    while (pos < lastLead) {
        int lead = contentByte[pos] & 0xFF;
        if (lead < 0x80) {
            pos++;
            continue;
        }
        highByteUnits++;
        int trail = contentByte[pos + 1] & 0xFF;
        boolean lowTrail = trail >= 0x40 && trail <= 0x7E;
        boolean highTrail = trail >= 0xA1 && trail <= 0xFE;
        if (lead >= 0xA1 && lead <= 0xF9 && (lowTrail || highTrail)) {
            big5Pairs++;
            weightCap += 500;
            int row = lead - 0xA1;
            // The two trail ranges are mapped into one column axis.
            int column = lowTrail ? trail - 0x40 : trail - 0x61;
            int weight = Big5Freq[row][column];
            if (weight != 0) {
                weightSum += weight;
            } else if (row >= 3 && row <= 37) {
                weightSum += 200;
            }
        }
        pos += 2;
    }

    float rangeValue = 50 * ((float) big5Pairs / (float) highByteUnits);
    float freqValue = 50 * ((float) weightSum / (float) weightCap);
    return (int) (rangeValue + freqValue);
}
/**
 * Scores how well the bytes fit EUC-TW (CNS 11643).
 *
 * <p>Four-byte sequences introduced by 8E (planes 1 - 16) count as in-range
 * without a frequency weight; two-byte plane-1 pairs (both bytes A1..FE)
 * are weighted with {@code EUC_TWFreq}.
 *
 * @param contentByte content bytes
 * @return the combined score
 */
private static int eucTwProbability(byte[] contentByte) {
    final int length = contentByte.length;
    // Counters start at 1 so the ratios below never divide by zero.
    int highByteUnits = 1;
    int cnsChars = 1;
    long weightSum = 0;
    long weightCap = 1;

    int pos = 0;
    while (pos < length - 1) {
        int lead = contentByte[pos] & 0xFF;
        if (lead < 0x80) {
            pos++;
            continue;
        }
        // High byte: start of a possible multi-byte character.
        highByteUnits++;
        int second = contentByte[pos + 1] & 0xFF;

        boolean fourByte = false;
        if (pos + 3 < length && lead == 0x8E && second >= 0xA1 && second <= 0xB0) {
            int third = contentByte[pos + 2] & 0xFF;
            int fourth = contentByte[pos + 3] & 0xFF;
            fourByte = third >= 0xA1 && third <= 0xFE && fourth >= 0xA1 && fourth <= 0xFE;
        }

        if (fourByte) {
            // Planes 1 - 16: less frequent characters, so frequency is ignored.
            cnsChars++;
            pos += 4;
        } else if (lead >= 0xA1 && lead <= 0xFE && second >= 0xA1 && second <= 0xFE) {
            // Plane 1
            cnsChars++;
            weightCap += 500;
            int row = lead - 0xA1;
            int weight = EUC_TWFreq[row][second - 0xA1];
            if (weight != 0) {
                weightSum += weight;
            } else if (row >= 35 && row <= 92) {
                weightSum += 150;
            }
            pos += 2;
        } else {
            // Unmatched high byte: move on by a single byte.
            pos++;
        }
    }

    float rangeValue = 50 * ((float) cnsChars / (float) highByteUnits);
    float freqValue = 50 * ((float) weightSum / (float) weightCap);
    return (int) (rangeValue + freqValue);
}
/**
 * Scores how well the bytes fit ISO-2022-CN.
 *
 * <p>Looks for the designators {@code ESC $ ) A} (GB) and {@code ESC $ ) G}
 * (CNS). After a designator, bytes are scanned up to the next ESC or the
 * end of the data; byte pairs in the printable range are counted and
 * weighted with {@code GBFreq} or {@code EUC_TWFreq} respectively.
 *
 * @param contentByte bytes to analyse
 * @return the combined score
 */
private static int iso2022CnProbability(byte[] contentByte) {
    final int contentLength = contentByte.length;
    // Counters start at 1 so the ratios below never divide by zero.
    int dbchars = 1;
    int isochars = 1;
    long isofreq = 0;
    long totalfreq = 1;

    for (int i = 0; i < contentLength - 1; i++) {
        if (contentByte[i] != (byte) 0x1B || i + 3 >= contentLength) {
            continue;
        }
        boolean designator = contentByte[i + 1] == (byte) 0x24 && contentByte[i + 2] == (byte) 0x29;
        if (designator && contentByte[i + 3] == (byte) 0x41) {
            // GB escape: ESC $ ) A
            i += 4;
            // Bounded scan: an unterminated segment simply ends with the data.
            while (i < contentLength && contentByte[i] != (byte) 0x1B) {
                dbchars++;
                if (i + 1 < contentLength
                        && 0x21 <= contentByte[i] && contentByte[i] <= 0x77
                        && 0x21 <= contentByte[i + 1] && contentByte[i + 1] <= 0x77) {
                    isochars++;
                    int row = contentByte[i] - 0x21;
                    int column = contentByte[i + 1] - 0x21;
                    totalfreq += 500;
                    if (GBFreq[row][column] != 0) {
                        isofreq += GBFreq[row][column];
                    } else if (15 <= row && row < 55) {
                        isofreq += 200;
                    }
                    i++;
                }
                i++;
            }
        } else if (designator && contentByte[i + 3] == (byte) 0x47) {
            // CNS escape: ESC $ ) G
            i += 4;
            while (i < contentLength && contentByte[i] != (byte) 0x1B) {
                dbchars++;
                if (i + 1 < contentLength
                        && 0x21 <= contentByte[i] && contentByte[i] <= 0x7E
                        && 0x21 <= contentByte[i + 1] && contentByte[i + 1] <= 0x7E) {
                    isochars++;
                    totalfreq += 500;
                    int row = contentByte[i] - 0x21;
                    int column = contentByte[i + 1] - 0x21;
                    if (EUC_TWFreq[row][column] != 0) {
                        isofreq += EUC_TWFreq[row][column];
                    } else if (35 <= row && row <= 92) {
                        isofreq += 150;
                    }
                    i++;
                }
                i++;
            }
        }
        // ASCII escape: ESC ( B. The bounds test comes first because the scans
        // above may have left i at the end of the data.
        if (i + 2 < contentLength && contentByte[i] == (byte) 0x1B
                && contentByte[i + 1] == (byte) 0x28 && contentByte[i + 2] == (byte) 0x42) {
            i += 2;
        }
    }

    float rangeValue = 50 * ((float) isochars / (float) dbchars);
    float freqValue = 50 * ((float) isofreq / (float) totalfreq);
    return (int) (rangeValue + freqValue);
}
/**
 * Scores how well the bytes fit UTF-8.
 *
 * <p>ASCII bytes are ignored. Among the remaining bytes, the share that
 * belongs to well-formed two-, three- or four-byte sequences (lead byte
 * followed by the right number of 10xxxxxx continuation bytes) becomes the
 * score. Low shares are reduced to zero to avoid coincidental matches.
 *
 * @param contentByte bytes to analyse
 * @return 0 for pure ASCII or an unconvincing share, otherwise the share in
 *         percent (96..100)
 */
private static int utf8Probability(byte[] contentByte) {
    final int contentLength = contentByte.length;
    int goodbytes = 0;
    int asciibytes = 0;

    for (int i = 0; i < contentLength; i++) {
        int lead = contentByte[i] & 0xFF;
        if (lead < 0x80) {
            // Ignore ASCII, it would throw off the count.
            asciibytes++;
            continue;
        }
        int continuationCount;
        if (lead >= 0xC0 && lead <= 0xDF) {
            continuationCount = 1; // two-byte sequence
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            continuationCount = 2; // three-byte sequence
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            continuationCount = 3; // four-byte sequence (supplementary planes)
        } else {
            continue; // stray continuation byte or invalid lead
        }
        if (i + continuationCount >= contentLength) {
            continue; // sequence is cut off by the end of the data
        }
        boolean wellFormed = true;
        for (int k = 1; k <= continuationCount; k++) {
            if ((contentByte[i + k] & 0xC0) != 0x80) {
                wellFormed = false;
                break;
            }
        }
        if (wellFormed) {
            goodbytes += continuationCount + 1;
            i += continuationCount;
        }
    }

    if (asciibytes == contentLength) {
        return 0;
    }
    int score = (int) (100 * ((float) goodbytes / (float) (contentLength - asciibytes)));
    // Scores of 98 or less are reduced to zero to prevent coincidental matches,
    // except that longer inputs may contain a few badly formed sequences.
    if (score > 98) {
        return score;
    } else if (score > 95 && goodbytes > 30) {
        return score;
    } else {
        return 0;
    }
}
/**
 * Reports whether the bytes start with a UTF-16 byte-order mark.
 *
 * @param rawtext bytes to analyse
 * @return 100 when the data starts with FE FF (big-endian) or FF FE
 *         (little-endian), otherwise 0; inputs shorter than two bytes yield 0
 */
private static int utf16Probability(byte[] rawtext) {
    // Guard both comparisons: without it the little-endian test would index
    // into arrays that are too short.
    if (rawtext.length < 2) {
        return 0;
    }
    boolean bigEndianBom = rawtext[0] == (byte) 0xFE && rawtext[1] == (byte) 0xFF;
    boolean littleEndianBom = rawtext[0] == (byte) 0xFF && rawtext[1] == (byte) 0xFE;
    return (bigEndianBom || littleEndianBom) ? 100 : 0;
}
/**
 * Scores how well the bytes fit plain ASCII.
 *
 * <p>Starts at 75 and loses 5 points for every byte with the high bit set
 * and for every ESC byte (used by ISO 2022).
 *
 * @param rawtext bytes to analyse
 * @return the remaining score, or 0 as soon as it is used up
 */
private static int asciiProbability(byte[] rawtext) {
    int remaining = 75;
    for (byte current : rawtext) {
        boolean suspicious = current < 0 || current == (byte) 0x1B;
        if (suspicious) {
            remaining -= 5;
        }
        if (remaining <= 0) {
            return 0;
        }
    }
    return remaining;
}
/**
 * Scores how well the bytes fit EUC-KR.
 *
 * <p>A pair is in range when both bytes are A1..FE; in-range pairs are
 * weighted with {@code KRFreq}. Pairs without a table entry add no weight.
 *
 * @param contentByte bytes to analyse
 * @return the combined score
 */
private static int eucKrProbability(byte[] contentByte) {
    // Counters start at 1 so the ratios below never divide by zero.
    int highByteUnits = 1;
    int krPairs = 1;
    long weightSum = 0;
    long weightCap = 1;

    final int lastLead = contentByte.length - 1;
    int pos = 0;
    while (pos < lastLead) {
        int lead = contentByte[pos] & 0xFF;
        if (lead < 0x80) {
            pos++;
            continue;
        }
        highByteUnits++;
        int trail = contentByte[pos + 1] & 0xFF;
        if (lead >= 0xA1 && lead <= 0xFE && trail >= 0xA1 && trail <= 0xFE) {
            krPairs++;
            weightCap += 500;
            // A zero entry contributes nothing, so the lookup needs no branch.
            weightSum += KRFreq[lead - 0xA1][trail - 0xA1];
        }
        pos += 2;
    }

    float rangeValue = 50 * ((float) krPairs / (float) highByteUnits);
    float freqValue = 50 * ((float) weightSum / (float) weightCap);
    return (int) (rangeValue + freqValue);
}
/**
 * Scores how well the bytes fit CP949.
 *
 * <p>A pair is in range when the lead byte is 81..FE and the trail byte is
 * 41..5A, 61..7A or 81..FE. Only pairs that also fall into the EUC-KR area
 * (both bytes A1..FE) receive a frequency weight from {@code KRFreq}.
 *
 * @param contentByte bytes to analyse
 * @return the combined score
 */
private static int cp949Probability(byte[] contentByte) {
    // Counters start at 1 so the ratios below never divide by zero.
    int highByteUnits = 1;
    int krPairs = 1;
    long weightSum = 0;
    long weightCap = 1;

    final int lastLead = contentByte.length - 1;
    int pos = 0;
    while (pos < lastLead) {
        int lead = contentByte[pos] & 0xFF;
        if (lead < 0x80) {
            pos++;
            continue;
        }
        highByteUnits++;
        int trail = contentByte[pos + 1] & 0xFF;
        boolean leadOk = lead >= 0x81 && lead <= 0xFE;
        boolean trailOk = (trail >= 0x41 && trail <= 0x5A)
                || (trail >= 0x61 && trail <= 0x7A)
                || (trail >= 0x81 && trail <= 0xFE);
        if (leadOk && trailOk) {
            krPairs++;
            weightCap += 500;
            boolean eucKrArea = lead >= 0xA1 && lead <= 0xFE && trail >= 0xA1 && trail <= 0xFE;
            if (eucKrArea) {
                weightSum += KRFreq[lead - 0xA1][trail - 0xA1];
            }
        }
        pos += 2;
    }

    float rangeValue = 50 * ((float) krPairs / (float) highByteUnits);
    float freqValue = 50 * ((float) weightSum / (float) weightCap);
    return (int) (rangeValue + freqValue);
}
/**
 * Reports whether the bytes contain the ISO-2022-KR designator {@code ESC $ ) C}.
 *
 * @param rawtext bytes to analyse
 * @return 100 when the four-byte sequence occurs anywhere, otherwise 0
 */
private static int iso2022KrProbability(byte[] rawtext) {
    final int lastStart = rawtext.length - 4;
    for (int start = 0; start <= lastStart; start++) {
        boolean designator = rawtext[start] == 0x1b
                && rawtext[start + 1] == '$'
                && rawtext[start + 2] == ')'
                && rawtext[start + 3] == 'C';
        if (designator) {
            return 100;
        }
    }
    return 0;
}
/**
 * Scores how well the bytes fit EUC-JP.
 *
 * <p>A pair is in range when both bytes are A1..FE; in-range pairs are
 * weighted with {@code JPFreq}. Pairs without a table entry add no weight.
 *
 * @param contentByte bytes to analyse
 * @return the combined score
 */
private static int eucJpProbability(byte[] contentByte) {
    // Counters start at 1 so the ratios below never divide by zero.
    int highByteUnits = 1;
    int jpPairs = 1;
    long weightSum = 0;
    long weightCap = 1;

    final int lastLead = contentByte.length - 1;
    int pos = 0;
    while (pos < lastLead) {
        int lead = contentByte[pos] & 0xFF;
        if (lead < 0x80) {
            pos++;
            continue;
        }
        highByteUnits++;
        int trail = contentByte[pos + 1] & 0xFF;
        if (lead >= 0xA1 && lead <= 0xFE && trail >= 0xA1 && trail <= 0xFE) {
            jpPairs++;
            weightCap += 500;
            // A zero entry contributes nothing, so the lookup needs no branch.
            weightSum += JPFreq[lead - 0xA1][trail - 0xA1];
        }
        pos += 2;
    }

    float rangeValue = 50 * ((float) jpPairs / (float) highByteUnits);
    float freqValue = 50 * ((float) weightSum / (float) weightCap);
    return (int) (rangeValue + freqValue);
}
/**
 * Reports whether the bytes contain the ISO-2022-JP escape {@code ESC $ B}.
 *
 * @param rawtext bytes to analyse
 * @return 100 when the three-byte sequence occurs anywhere, otherwise 0
 */
private static int iso2022JpProbability(byte[] rawtext) {
    final int lastStart = rawtext.length - 3;
    for (int start = 0; start <= lastStart; start++) {
        boolean escape = rawtext[start] == 0x1b
                && rawtext[start + 1] == '$'
                && rawtext[start + 2] == 'B';
        if (escape) {
            return 100;
        }
    }
    return 0;
}
/**
 * Scores how well the bytes fit Shift-JIS.
 *
 * <p>A pair is in range when the lead byte is 81..9F or E0..EF and the trail
 * byte is 40..7E or 80..FC. Each in-range pair is converted to its JIS X 0208
 * row/cell so that it can be weighted with {@code JPFreq}, the same table
 * that the EUC-JP scorer indexes with (byte - 0xA1). The result is reduced by
 * one as a handicap against the other double-byte scorers.
 *
 * @param contentByte bytes to analyse
 * @return the combined score minus one
 */
private static int sjisProbability(byte[] contentByte) {
    final int contentLength = contentByte.length;
    // Counters start at 1 so the ratios below never divide by zero.
    int dbchars = 1;
    int jpchars = 1;
    long jpfreq = 0;
    long totalfreq = 1;

    for (int i = 0; i < contentLength - 1; i++) {
        if (contentByte[i] >= 0) {
            continue;
        }
        dbchars++;
        // Work on unsigned values; adding 256 is only correct for negative
        // bytes, and the trail byte may be in 40..7E.
        int lead = contentByte[i] & 0xFF;
        int trail = contentByte[i + 1] & 0xFF;
        boolean leadOk = (lead >= 0x81 && lead <= 0x9F) || (lead >= 0xE0 && lead <= 0xEF);
        boolean trailOk = (trail >= 0x40 && trail <= 0x7E) || (trail >= 0x80 && trail <= 0xFC);
        if (!leadOk || !trailOk) {
            // Includes half-width katakana (A1..DF), which carry no weight here.
            continue;
        }
        jpchars++;
        totalfreq += 500;

        // Standard Shift-JIS to JIS X 0208 conversion: each lead byte covers two
        // JIS rows, and trail bytes below 9F belong to the first of them.
        int adjust;
        int jisCell;
        if (trail < 0x9F) {
            adjust = 1;
            jisCell = trail - (trail > 0x7F ? 0x20 : 0x1F);
        } else {
            adjust = 0;
            jisCell = trail - 0x7E;
        }
        int jisRow = ((lead - (lead < 0xA0 ? 0x70 : 0xB0)) << 1) - adjust;

        // JIS rows and cells start at 0x21; the table is zero-based.
        int row = jisRow - 0x21;
        int column = jisCell - 0x21;
        if (row >= 0 && row < JPFreq.length && column >= 0 && column < JPFreq[row].length) {
            jpfreq += JPFreq[row][column];
        }
        // Skip the trail byte that was just consumed.
        i++;
    }

    float rangeValue = 50 * ((float) jpchars / (float) dbchars);
    float freqValue = 50 * ((float) jpfreq / (float) totalfreq);
    // Handicap of one point, as in the GBK and GB18030 scorers.
    return (int) (rangeValue + freqValue) - 1;
}
/**
 * Resets the GB, GBK, Big5, EUC-TW and JP frequency tables to zero and then
 * assigns the known weights.
 *
 * <p>Only two assignments are listed here; the article states that the full
 * initialisation is too long to print and has to be taken from its download.
 */
private static void initializeFrequencies() {
    int[][][] tablesToClear = {GBFreq, GBKFreq, Big5Freq, EUC_TWFreq, JPFreq};
    for (int[][] table : tablesToClear) {
        for (int[] row : table) {
            java.util.Arrays.fill(row, 0);
        }
    }
    // The file is too large: the remaining initialisation is missing here and
    // must be taken from the download mentioned in the article.
    GBFreq[20][35] = 599;
    JPFreq[26][89] = 0;
}
/**
 * Encodings known to the detector.
 *
 * <p>Each constant carries a numeric id and three names. Ids are unique:
 * {@code EUC_KR} keeps 24 and the UTF-16/UTF-32 constants that used to share
 * that value are numbered 25 to 30. Getters are generated by Lombok.
 */
@Getter
public enum DetectEncoding {
    ISO2022CN_GB(1, "ISO2022CN_GB", "ISO-2022-CN-EXT", "ISO2022CN-GB"),
    ISO2022CN_CNS(2, "ISO2022CN_CNS", "ISO-2022-CN-EXT", "ISO2022CN-CNS"),
    CP949(3, "MS949", "x-windows-949", "CP949"),
    UNICODES(4, "Unicode", "UTF-16", "Unicode (Simp)"),
    UNICODET(5, "Unicode", "UTF-16", "Unicode (Trad)"),
    SJIS(6, "SJIS", "Shift_JIS", "Shift-JIS"),
    BIG5(7, "BIG5", "BIG5", "Big5"),
    ASCII(8, "ASCII", "ASCII", "ASCII"),
    GB18030(9, "GB18030", "GB18030", "GB18030"),
    CNS11643(10, "EUC-TW", "EUC-TW", "CNS11643"),
    UNICODE(11, "Unicode", "UTF-16", "Unicode"),
    OTHER(12, "ISO8859_1", "ISO8859-1", "OTHER"),
    GBK(13, "GBK", "GBK", "GBK"),
    ISO2022CN(14, "ISO2022CN", "ISO-2022-CN", "ISO2022 CN"),
    HZ(15, "ASCII", "HZ-GB-2312", "HZ"),
    JOHAB(16, "Johab", "x-Johab", "Johab"),
    ISO2022KR(17, "ISO2022KR", "ISO-2022-KR", "ISO 2022 KR"),
    UTF8(18, "UTF-8", "UTF-8", "UTF-8"),
    UTF8T(19, "UTF-8", "UTF-8", "UTF-8 (Trad)"),
    ISO2022JP(20, "ISO2022JP", "ISO-2022-JP", "ISO 2022 JP"),
    UTF8S(21, "UTF-8", "UTF-8", "UTF-8 (Simp)"),
    GB2312(22, "GB2312", "GB2312", "GB-2312"),
    EUC_JP(23, "EUC_JP", "EUC-JP", "EUC-JP"),
    EUC_KR(24, "EUC_KR", "EUC-KR", "EUC-KR"),
    UTF_16(25, "UTF-16", "UTF-16", "UTF-16"),
    UTF_16BE(26, "UTF-16BE", "UTF-16BE", "UTF-16BE"),
    UTF_16LE(27, "UTF-16LE", "UTF-16LE", "UTF-16LE"),
    UTF_32(28, "UTF-32", "UTF-32", "UTF-32"),
    UTF_32BE(29, "UTF-32BE", "UTF-32BE", "UTF-32BE"),
    UTF_32LE(30, "UTF-32LE", "UTF-32LE", "UTF-32LE");

    /** Numeric identifier, unique per constant. */
    private final Integer id;
    /** Name in the "java" column; presumably the charset name for Java APIs (confirm before relying on it). */
    private final String javaName;
    /** Name in the "html" column; presumably the label used in HTML/MIME charset declarations (confirm). */
    private final String htmlName;
    /** Human-readable display name. */
    private final String niceName;

    DetectEncoding(Integer id, String javaName, String htmlName, String niceName) {
        this.id = id;
        this.javaName = javaName;
        this.htmlName = htmlName;
        this.niceName = niceName;
    }
}
}
测试
/**
 * Runs the detector with its default settings on the generated sample file
 * and prints the result.
 *
 * @throws IOException if the sample file cannot be read
 */
@Test
public void detect() throws IOException {
    File sample = new File("F:\\tmp\\gb2312.txt");
    EncodingDetectHelper.DetectEncoding detected = EncodingDetectHelper.detectEncoding(sample);
    System.out.println(detected);
}