1 环境:基于上一篇搭建的高可用分布式集群
2 官方提供MapReduce程序
# Estimate pi with the example job bundled with Hadoop.
# Arguments to `pi`: 2 map tasks, 6 samples per map task.
# The jar is referenced by a relative path, so `&&` keeps `hadoop jar` from
# running in the wrong directory if the cd fails.
cd /data/hadoop/share/hadoop/mapreduce/ &&
  hadoop jar hadoop-mapreduce-examples-3.4.0.jar pi 2 6
3 实例项目分析1
#待分析的文件为 word.txt,用于单词统计
#
# Upload the input file to HDFS.
# Create the target directory first (-p: no error if it already exists) so the
# upload does not depend on /test/01/ having been created beforehand.
hdfs dfs -mkdir -p /test/01/ &&
  hdfs dfs -put word.txt /test/01/
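#可以先在本地测试一下,再提交到集群运行计算
#注意:下面的管道在 mapper 和 reducer 之间没有 sort,所以不相邻的相同单词没有被合并;
#要在本地模拟 Hadoop 的 shuffle 排序,应使用: cat word.txt | python m.py | sort | python r.py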
[root@master11 01]# cat word.txt | python m.py |python r.py
foo 1
quux 1
labs 1
foo 1
bar 1
quux 1
good 1
six 1
good 1
foo 2
quux 1
labs 1
foo 1
bar 1
quux 1
good 1
six 1
good 1
foo 1
# Run the word count as a Hadoop Streaming job: m.py is the mapper, r.py the
# reducer, every file under /test/01/ is input, results go to /test/output1/.
# stream.non.zero.exit.is.failure=false makes Hadoop treat a mapper/reducer
# that exits non-zero as successful, so a crashing script will NOT fail the job.
# NOTE(review): the scripts are referenced by local absolute path; presumably
# they must exist and be executable on every node that runs tasks - confirm.
hadoop jar /data/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.4.0.jar \
  -D stream.non.zero.exit.is.failure=false \
  -mapper /data/test/01/m.py \
  -reducer /data/test/01/r.py \
  -input /test/01/ \
  -output /test/output1/
# Copy the result file from HDFS to the local filesystem.
hadoop fs -copyToLocal /test/output1/part-00000 /root/part-00000
#查看
[root@master11 ~]# cat part-00000
bar 2
foo 6
good 4
labs 2
quux 4
six 2
[root@master11 01]# cat m.py
#!/usr/bin/env python
"""Hadoop Streaming word-count mapper.

Reads text lines from stdin and writes one ``word<TAB>1`` record to stdout
for every whitespace-separated word.
"""
import sys


def map_words(lines):
    """Yield a ``word<TAB>1`` record for each whitespace-separated word.

    lines: iterable of text lines; a trailing newline is optional.
    """
    for line in lines:
        # split() with no argument ignores leading/trailing whitespace and
        # yields nothing for a blank line, so no separate strip() is needed.
        for word in line.split():
            yield '%s\t%s' % (word, 1)


def main():
    """Stream mapper records from stdin to stdout."""
    for record in map_words(sys.stdin):
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(record)


if __name__ == '__main__':
    main()
[root@master11 01]# cat r.py
#!/usr/bin/env python
"""Hadoop Streaming word-count reducer.

Reads ``word<TAB>count`` records from stdin and writes one
``word<TAB>total`` record per run of consecutive identical words. The input
must therefore be sorted by word: Hadoop does this between map and reduce;
when testing locally, put ``sort`` between the mapper and this script.
"""
from operator import itemgetter  # unused; kept from the original script
import sys


def reduce_counts(lines):
    """Yield ``word<TAB>total`` for each run of consecutive identical words.

    lines: iterable of ``word<TAB>count`` records, grouped by word.
    Records without a tab or with a non-integer count are skipped.
    """
    current_word = None
    current_count = 0
    for line in lines:
        word, sep, count = line.strip().partition('\t')
        if not sep:
            # Blank or malformed record: skip it instead of crashing on unpack.
            continue
        try:
            count = int(count)
        except ValueError:
            continue
        if word == current_word:
            current_count += count
            continue
        # A new word starts: flush the finished run, if there was one.
        if current_word is not None:
            yield '%s\t%s' % (current_word, current_count)
        current_word = word
        current_count = count
    # Flush the last run. Testing against None (rather than comparing with the
    # last word read) keeps empty input from producing a bogus "None 0" record.
    if current_word is not None:
        yield '%s\t%s' % (current_word, current_count)


def main():
    """Stream reducer records from stdin to stdout."""
    for record in reduce_counts(sys.stdin):
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(record)


if __name__ == '__main__':
    main()
4 项目实战分析2:nginx 日志 IP 计数
# Upload the nginx access log to HDFS.
hdfs dfs -put t.log /test/01/
# Run the per-IP count as a Hadoop Streaming job: map.py is the mapper,
# red.py the reducer, only t.log is input, results go to /test/output2/.
# stream.non.zero.exit.is.failure=false makes Hadoop treat a mapper/reducer
# that exits non-zero as successful, so a crashing script will NOT fail the job.
hadoop jar /data/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.4.0.jar \
  -D stream.non.zero.exit.is.failure=false \
  -mapper /data/test/01/map.py \
  -reducer /data/test/01/red.py \
  -input /test/01/t.log \
  -output /test/output2/
# Download the result. /root/part-00000 already exists from the word-count
# example, and the copy refuses to overwrite an existing file without -f.
hadoop fs -copyToLocal -f /test/output2/part-00000 /root/part-00000
#查看
[root@master11 01]# cat map.py
#!/usr/bin/python
"""Hadoop Streaming mapper: emit the leading IP address of each log line.

Reads log lines from stdin and writes one ``ip<TAB>1`` record to stdout for
every line that begins with a run of digits and dots.
"""
import sys
import re

# Compiled once instead of once per input line. `+` (not `*`) requires at
# least one character, so lines that do not start with an address are skipped
# rather than emitted with an empty key.
LEADING_IP = re.compile(r'([\d.]+)')


def extract_ips(lines):
    """Yield an ``ip<TAB>1`` record for each line starting with digits/dots.

    lines: iterable of log lines; a trailing newline is optional.
    """
    for line in lines:
        match = LEADING_IP.match(line)
        if match:
            # No leading space in the record: the key is the bare address.
            yield '%s\t%s' % (match.group(1), 1)


def main():
    """Stream mapper records from stdin to stdout."""
    for record in extract_ips(sys.stdin):
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(record)


if __name__ == '__main__':
    main()
[root@master11 01]# cat red.py
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Hadoop Streaming reducer: total the counts emitted for each key.

Reads ``key<TAB>count`` records from stdin and writes one
``key<TAB>total`` record per distinct key to stdout, in sorted key order.
"""
import sys
import os      # unused; kept from the original script
import string  # unused; kept from the original script


def count_keys(lines):
    """Return sorted ``key<TAB>total`` records for the given input records.

    lines: iterable of ``key<TAB>count`` records; a trailing newline is
    optional. A record with no tab or a non-integer count is counted once
    under the whole record as its key.
    """
    totals = {}
    for line in lines:
        # Strip only the newline; slicing off the last character would eat
        # real data when the final line has no trailing newline.
        record = line.rstrip('\n')
        key, _sep, value = record.partition('\t')
        try:
            amount = int(value)
        except ValueError:
            key, amount = record, 1
        # dict.get replaces dict.has_key, which no longer exists on Python 3.
        totals[key] = totals.get(key, 0) + amount
    return [key + "\t" + str(totals[key]) for key in sorted(totals)]


def main():
    """Write the per-key totals for stdin to stdout."""
    for record in count_keys(sys.stdin):
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(record)


if __name__ == '__main__':
    main()
5 欢迎同学们一起交流