Example: Word Count in Java

We first create three classes: WordCountMain, WordCountMapper, and WordCountReducer. WordCountMain configures and submits the job, WordCountMapper emits a <word, 1> pair for every word it sees, and WordCountReducer sums those pairs per word. The corresponding code is as follows.
WordCountMain
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountMain {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // The jar containing this class is shipped to the cluster.
        job.setJarByClass(WordCountMain.class);

        // Wire up the mapper and reducer implementations.
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // Key/value types emitted by the map phase.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Key/value types of the final (reduce) output.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Input and output HDFS paths come from the command line.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Block until the job finishes; exit non-zero on failure.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
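One optional tweak, not in the original listing: because the reduce function simply sums counts, the same class can also serve as a combiner, pre-aggregating map output locally and shrinking the data shuffled across the network:

// Optional: run the reducer as a combiner on the map side.
job.setCombinerClass(WordCountReducer.class);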
WordCountMapper
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key1, Text value1, Context context)
            throws IOException, InterruptedException {
        // Each input value is one line of text; split it into words.
        // (Splitting on a single space assumes space-separated input.)
        String data = value1.toString();
        String[] words = data.split(" ");

        // Emit <word, 1> for every word on the line.
        for (String w : words) {
            context.write(new Text(w), new LongWritable(1));
        }
    }
}
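For example, given the input line "to be or not to be", the mapper emits <to, 1>, <be, 1>, <or, 1>, <not, 1>, <to, 1>, <be, 1>; the framework then groups these pairs by key before they reach the reducer.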
WordCountReducer
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text k3, Iterable<LongWritable> v3, Context context)
            throws IOException, InterruptedException {
        // Sum all the 1s emitted by the mappers for this word.
        long total = 0;
        for (LongWritable v : v3) {
            total += v.get();
        }
        // Emit <word, total count>.
        context.write(k3, new LongWritable(total));
    }
}
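Continuing the example above, the reducer receives <be, [1, 1]> and writes <be, 2>, and likewise for every other word.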
To run the job, the three classes are compiled against the Hadoop libraries and packaged into a jar (here named 1.jar, with the classes in package ch01), then submitted with:

hadoop jar 1.jar ch01.WordCountMain /user/data/input/hamlet.txt /user/data/output/ch1

The general form is: hadoop jar <jar file> <main class> <input path> <output path>. Note that the output directory must not already exist; Hadoop refuses to start a job whose output path is already present.
Results
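With the default single reducer, the word counts land in a file named part-r-00000 inside the output directory and can be printed with:

hdfs dfs -cat /user/data/output/ch1/part-r-00000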