一 网络聚合通信测试
以下测试用例为:
整集群测试,每节点进程数从2开始以2的幂次增加至满核心;
测试常见的聚合通信项目,共8个条目
二 测试前准备
- 待测节点已完成OS安装及基础配置
- 待测节点已配置完IP(若存在IB,则需要配置IB IP)
- 待测节点做完互信操作
- 所有节点具有共享存储
- 编译安装osu_benchmark测试工具至共享目录
三 测试
- 上传测试脚本至osu_benchmark测试工具目录,如:
/share/opt/osu/libexec/osu-micro-benchmarks/mpi/collective/
- 在该目录创建nodelist文件并填入待测节点IP
vim nodelist
10.186.121.102
10.186.121.103
10.186.121.104
10.186.121.105
10.186.121.106
10.186.121.107
10.186.121.108
10.186.121.109
10.186.121.110
........
- 在该目录创建processlist文件并填入测试进程数
#假设每节点总核数为64,从2开始,以2的幂次增加
vim processlist
2
4
8
16
32
64
- 为脚本赋予执行权限
chmod +x osu_batch_test.sh
- 执行脚本进行测试
[root@linux ~]# bash osu_batch_test.sh
===============================================
>>> Please choose a number to continue:
1 osu_allgather
2 osu_allreduce
3 osu_alltoall
4 osu_barrier
5 osu_bcast
6 osu_gather
7 osu_reduce
8 osu_scatter
9 Exit
>>>input number>>>
- 执行完成后,所有日志会保存在当前目录下的log文件夹中
四 脚本
#!/bin/bash
# Batch driver for the OSU micro-benchmark collective tests.
#
# Run it from the directory that holds the osu_* collective binaries.
# That directory must also contain:
#   nodelist    - one host/IP per line
#   processlist - one processes-per-node value per line
# Lines starting with '#' and blank lines are ignored in both files.
# Benchmark output is appended to files under ./log.

current_dir=$(pwd)
node_file="${current_dir}/nodelist"
proc_file="${current_dir}/processlist"

# The log directory must be the one created here. The previous code built the
# path from an undefined variable, so every log redirect pointed at /log/.
logfile="${current_dir}/log"
mkdir -p "${logfile}"

# Value handed to every benchmark through its -m option
# (presumably the maximum message size in bytes - confirm for your OSU version).
size=65536

if [[ ! -f "${node_file}" || ! -f "${proc_file}" ]]; then
  echo -e "Error: Nodes file ${node_file} or Process file ${proc_file} is not exist." >&2
  exit 1
fi

# Collect the per-node process counts: first column of every line that is
# neither a comment nor blank. Read straight from the input file so no
# temporary files are left behind in the working directory.
mapfile -t proc_list < <(awk '!/^#/ && NF { print $1 }' "${proc_file}")

# Derive the count from the array itself so the two can never disagree.
count=${#proc_list[@]}

# Number of usable node entries; only used to name the log files.
nodenum=$(awk '!/^#/ && NF' "${node_file}" | wc -l)
nodenum=$((nodenum))

if ((count == 0 || nodenum == 0)); then
  echo -e "Warning: Nodes file ${node_file} or process file ${proc_file} is empty, skip." >&2
  exit 1
fi
#######################################
# Run one OSU collective benchmark once for every entry in proc_list and
# append its output to a per-benchmark log file.
# Globals (read): proc_list, node_file, current_dir, size, logfile, nodenum
# Arguments:
#   $1 - benchmark binary name, e.g. osu_allgather; it is looked up in
#        ${current_dir}
# Outputs:
#   Progress messages on stdout, a warning on stderr when mpirun fails,
#   benchmark output appended to ${logfile}/${nodenum}nodes_<benchmark>.log
# Returns:
#   Status of the final timestamp write; a failing mpirun run does not
#   abort the remaining runs.
#######################################
_run_osu_collective() {
  local bench=$1
  local log="${logfile}/${nodenum}nodes_${bench}.log"
  local ppn rc

  echo -e "\n>>> Start to test ${bench} :"
  echo -e "--------------------------------------------------------------------------------------"
  for ppn in "${proc_list[@]}"; do
    echo -e "\n>>> Start to test ppn=${ppn} :"
    echo -e "\n>>>> ppn=${ppn}" >> "${log}"
    rc=0
    mpirun -ppn "${ppn}" -hostfile "${node_file}" \
      "${current_dir}/${bench}" -m "${size}" >> "${log}" || rc=$?
    if ((rc != 0)); then
      # Report the failure but keep going so the other ppn values still run.
      echo "Warning: ${bench} with ppn=${ppn} exited with status ${rc}" >&2
    fi
    # Short pause between consecutive runs.
    sleep 2
  done
  echo "the current test time is $(date +%Y-%m-%d-%H%M%S)" >> "${log}"
}

# Public entry points, one per benchmark; each delegates to the shared runner.
test_osu_allgather() { _run_osu_collective osu_allgather; }
test_osu_allreduce() { _run_osu_collective osu_allreduce; }
test_osu_alltoall() { _run_osu_collective osu_alltoall; }
test_osu_barrier() { _run_osu_collective osu_barrier; }
test_osu_bcast() { _run_osu_collective osu_bcast; }
test_osu_gather() { _run_osu_collective osu_gather; }
test_osu_reduce() { _run_osu_collective osu_reduce; }
test_osu_scatter() { _run_osu_collective osu_scatter; }
# main function
# Interactive menu: print the list of benchmarks, read one choice from stdin,
# run the matching test_osu_* function, and repeat until the user picks Exit
# or stdin reaches end-of-file.
echo -e "==============================================="
while :; do
  echo -e "\n>>> Please choose a number to continue:"
  echo -e "1 osu_allgather"
  echo -e "2 osu_allreduce"
  echo -e "3 osu_alltoall"
  # The label used to read "3", but the dispatcher below expects 4.
  echo -e "4 osu_barrier"
  echo -e "5 osu_bcast"
  echo -e "6 osu_gather"
  echo -e "7 osu_reduce"
  echo -e "8 osu_scatter"
  echo -e "9 Exit"

  # read fails at end-of-file (Ctrl-D, closed or exhausted stdin). Without
  # this check the loop would spin forever on an empty choice.
  if ! read -r -p ">>>input number>>> " nu; then
    echo -e "\n>>> exit"
    exit 0
  fi

  case "${nu}" in
    1) test_osu_allgather ;;
    2) test_osu_allreduce ;;
    3) test_osu_alltoall ;;
    4) test_osu_barrier ;;
    5) test_osu_bcast ;;
    6) test_osu_gather ;;
    7) test_osu_reduce ;;
    8) test_osu_scatter ;;
    9)
      echo -e "\n>>> exit"
      exit 0
      ;;
    *)
      echo -e "\033[41;37m unsupported input. \033[0m"
      ;;
  esac
done
日常总结,一起学习进步