词频统计原数据和结果数据地址:https://download.csdn.net/download/LiHaoHang6/88845654?spm=1001.2014.3001.5501
运行效果展示:
原数据展示:
词频统计思路:
1:先通过BufferedReader来读取本地文本文件,之后将文本切割为String[]数组
2:创建一个Map来存储单词及其出现的次数
3:定义正则表达式,匹配单词
4:遍历每一条数据,通过正则匹配单词
5:对遍历中匹配到的每个单词,累计其出现的次数
6:按照value进行降序排序
7:将排序后的数据列表导出到CSV文件
词频统计代码
package **;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Word-frequency statistics.
 *
 * <p>Reads a UTF-8 text file, counts how often each word occurs (case-insensitively),
 * prints the counts in descending order and exports them to a CSV file.
 *
 * @author CSDN Li Zhidao
 * @version v1.0 (created 2024/2/20 14:33)
 */
public class wordFrequencyStatistics {

    /**
     * Default input file, used when no path is given on the command line. The file name is
     * Chinese for "Untitled 2"; it is written with Unicode escapes so this source compiles
     * regardless of the encoding javac assumes.
     */
    private static final String DEFAULT_INPUT_PATH = "D:\\www\\" + "\u65e0\u6807\u98982.txt";

    /** Default CSV output file, used when no output path is given on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "sorted_data.csv";

    /**
     * Matches a maximal run of letters delimited by word boundaries.
     *
     * <p>UNICODE_CHARACTER_CLASS makes the word-boundary test Unicode-aware on every JDK.
     * Without it, JDK 19+ treats the boundary as ASCII-only, which splits or drops words
     * that contain non-ASCII letters.
     */
    private static final Pattern WORD_PATTERN =
            Pattern.compile("\\b\\p{L}+\\b", Pattern.UNICODE_CHARACTER_CLASS);

    /**
     * Counts the words of a text file, prints them by descending frequency and exports the
     * result as CSV.
     *
     * @param args optional; {@code args[0]} is the input text file and {@code args[1]} the
     *             CSV output file. Missing values fall back to the built-in defaults.
     */
    public static void main(String[] args) {
        String filePath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_PATH;

        Map<String, Integer> wordFrequencyMap;
        // Decode explicitly as UTF-8 so the result does not depend on the platform charset.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8))) {
            wordFrequencyMap = countWords(reader);
        } catch (IOException e) {
            System.err.println("Error occurred while reading " + filePath + ": " + e.getMessage());
            return; // nothing to sort or export if the input could not be read
        }

        List<Map.Entry<String, Integer>> list = sortByFrequency(wordFrequencyMap);

        // Print the sorted result.
        for (Map.Entry<String, Integer> entry : list) {
            System.out.println(entry.getKey() + ": " + entry.getValue());
        }

        // Export the sorted entries to the CSV file.
        exportToCSV(list, outputPath);
    }

    /**
     * Reads {@code reader} line by line and counts every word matched by the word pattern.
     * Words are lower-cased with the root locale so counting is case-insensitive and does not
     * vary with the JVM's default locale.
     *
     * @param reader source of the text; not closed by this method
     * @return a map from lower-cased word to its number of occurrences
     * @throws IOException if reading from {@code reader} fails
     */
    private static Map<String, Integer> countWords(BufferedReader reader) throws IOException {
        Map<String, Integer> counts = new HashMap<>();
        String line;
        while ((line = reader.readLine()) != null) {
            Matcher matcher = WORD_PATTERN.matcher(line);
            while (matcher.find()) {
                counts.merge(matcher.group().toLowerCase(Locale.ROOT), 1, Integer::sum);
            }
        }
        return counts;
    }

    /**
     * Returns the entries of {@code counts} ordered by descending count. Entries with equal
     * counts are ordered by word so the output is deterministic.
     *
     * @param counts word-to-count map
     * @return a new, sorted list of the map's entries
     */
    private static List<Map.Entry<String, Integer>> sortByFrequency(Map<String, Integer> counts) {
        List<Map.Entry<String, Integer>> entries = new ArrayList<>(counts.entrySet());
        Comparator<Map.Entry<String, Integer>> byCountDescending =
                Map.Entry.<String, Integer>comparingByValue().reversed();
        entries.sort(byCountDescending.thenComparing(Map.Entry.<String, Integer>comparingByKey()));
        return entries;
    }

    /**
     * Writes the entries to a UTF-8 CSV file with a {@code Key,Value} header row.
     * On failure an error message is printed to standard error; no exception is propagated.
     *
     * @param list     entries to export, written in list order
     * @param filename path of the CSV file to create or overwrite
     */
    private static void exportToCSV(List<Map.Entry<String, Integer>> list, String filename) {
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(filename), StandardCharsets.UTF_8))) {
            writer.write("Key,Value\n");
            for (Map.Entry<String, Integer> entry : list) {
                writer.write(entry.getKey() + "," + entry.getValue() + "\n");
            }
        } catch (IOException e) {
            System.err.println("Error occurred while exporting data to CSV: " + e.getMessage());
            return;
        }
        // Reported only after the writer has been flushed and closed without error.
        System.out.println("Data has been exported to " + filename);
    }
}