ceph
ceph osd lspools
rbd ls -p testpool
# 查看 ceph 集群中有多少个 pool,以及每个 pool 的容量及利用情况
rados df
ceph -s
ceph osd tree
ceph df
ceph versions
ceph osd pool ls
ceph osd crush rule dump
ceph auth print-key client.admin
ceph orch host ls
ceph crash ls
ceph osd pool stats
ceph df detail
ceph osd stat
ceph mon stat
查看image rbd
rbd ls -p kube
ceph osd df
ceph osd pool autoscale-status
ceph:
10.240.62.11/12/13
root:autelceph2
用户名:autel
密码:Autonomy@Autel
13 Autel#3913
[root@ceph-admin ~]# ceph mgr services
{
"dashboard": "https://10.250.53.152:8443/",
"prometheus": "http://10.250.53.152:9283/"
}
kubectl logs -f qinzhao-cache-resunet-demo-pipeline-wbkkh-2890309351 -n qinzhao -c lustre-importer-preload
kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu"
apps/jupyter/jupyter-web-app/upstream/base/configs/spawner_ui_config.yaml
kustomize build apps/jupyter/jupyter-web-app/upstream/overlays/istio | kubectl apply -f -
kustomize build apps/tensorboard/tensorboard-controller/upstream/overlays/kubeflow | kubectl apply -f -
kubectl get pods -n kubeflow -l kustomize.component=profiles
https://www.amazonaws.cn/ec2/instance-types/
kubectl taint node autel-poweredge-r750 nodetype=T4:NoExecute
kubectl taint node autelrobotics-gpu10 nodetype=RTX3090:NoExecute
autelrobotics-gpu10
kubectl taint node autel-poweredge-r750 nodetype:NoExecute-
kubectl taint node autelrobotics-gpu09 nodetype:NoSchedule-
kubectl taint node autelrobotics-gpu09 nodetype:NoExecute-
nodegroup=gpu:NoSchedule
kubectl taint nodes autelrobotics-gpu02 nodegroup=gpu:NoSchedule
kubectl label node autelrobotics-gpu02 gputype=A40
lsof -n -P -i:22
strace
kubectl get csinode
查看活跃进程个数
top -H -p 1
kubectl create secret tls ai-tls \
--namespace ai-test \
--key tls.key \
--cert tls.pem
https://github.com/NVIDIA/nvidia-docker/issues/1678
nvidia-container-cli -k -d /dev/tty info
ls -l /dev/char
cat /etc/nvidia-container-runtime/config.toml
stat -fc %T /sys/fs/cgroup/
sar -n TCP,ETCP 1
fdisk -l
ldd
# 修改后,重新挂载生效
# mount -o remount /dev/shm
nstat
mpstat -P ALL 1
slabtop
pcstat
netstat -ant | awk '{print $6}' | sort | uniq -c | sort -n
dmesg -T
pmap -x 1649 | sort -k 3 -n -r
cat /proc/1649/smaps | grep 7f4250021000
(gdb) dump memory memory.dump 0x7f2340539000 0x7f235d553000
strings memory.dump
pidstat -p pid -r 1 1000
sudo ./stackcount ip_output
dmesg -Tw
perf
NetHogs
iftop -i eth0 -P -N
./opensnoop -Tn snmp-pass
slabtop
nfsstat -c
du -ah --max-depth=2 /var/log |sort -rh |head -10
./fileslower
ulimit -a
解决显存释放问题:
fuser -v /dev/nvidia*
lsof -Pni
netstat -n | awk '/^tcp/ {++S[$NF]} END {for(a in S) print a, S[a]}'
NFS运维:
systemctl status rpcbind nfs-server
nfsiostat
dmesg | grep nfs
exportfs -v
mpstat -P ALL 1
ss -t -a |grep "IP"
nfsstat -c
iostat
iostat -d -x -k 1
netstat -an | grep "IP:2049"
dstat
ps aux | grep /app
https://learnku.com/articles/39851
https://zhuanlan.zhihu.com/p/614314627
fdisk -l
blkid
nfsiostat 1
sar -b 1
iostat -m -d /dev/md0 1
strace -p pid 查看进程当前正在执行的系统调用,查死循环或者卡顿时极为有用
strace -e trace=open,openat /usr/local/kk-mail/service/dovecot/sbin/dovecot 启动进程并查看它打开了哪些文件
cat /proc/715765/task/*/stack
/proc/12544/task/12873/stack
systemtap
cat /var/log/Xorg.0.log |grep -i "nvidia"