一、开三台虚拟机进行试验(centos7)
1、初始操作
# 关闭防火墙
systemctl stop firewalld
systemctl disable firewalld
# 关闭selinux
sudo sed -i 's/enforcing/disabled/' /etc/selinux/config # 永久
setenforce 0 # 临时
# 关闭swap
sudo swapoff -a # 临时
sed -ri 's/.*swap.*/#&/' /etc/fstab # 永久
# 关闭完swap后,一定要重启一下虚拟机!!!
sudo reboot
# 根据规划设置主机名(下面三条命令分别在对应的机器上执行,每台机器只执行其中一条)
hostnamectl set-hostname k8s-master
hostnamectl set-hostname k8s-node1
hostnamectl set-hostname k8s-node2
# 在master添加hosts
cat >> /etc/hosts << EOF
192.168.129.131 k8s-master
192.168.129.132 k8s-node1
192.168.129.133 k8s-node2
EOF
# 将桥接的IPv4流量传递到iptables的链(需要先加载 br_netfilter 内核模块:modprobe br_netfilter,否则 sysctl --system 会提示找不到 net.bridge.* 参数)
cat > /etc/sysctl.d/k8s.conf << EOF
net.bridge.bridge-nf-call-ip6tables = 1
net.bridge.bridge-nf-call-iptables = 1
EOF
sysctl --system # 生效
# 时间同步
yum install ntpdate -y
ntpdate time.windows.com
2、可能会遇到的问题:
(1)用户无法执行sudo命令
www 不在 sudoers 文件中。此事将被报告。
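# 进入root用户
su -
# 使用 visudo 打开 /etc/sudoers 文件(已经是 root,无需再加 sudo;visudo 保存时会检查语法)
visudo
# 或者直接用 vi 打开 /etc/sudoers 文件(与 visudo 二选一)
vi /etc/sudoers
# 添加下面一行(将 username 替换为实际的用户名,例如上面报错中的 www)
username ALL=(ALL:ALL) ALL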
(2)更新仓库地址
CentOS 7 yum无法使用解决方法Could not retrieve mirrorlist http://mirrorlist.centos.org/?release=7&arch= - 愚生浅末 - 博客园
(3)vim命令无法使用
sudo yum install vim
二、安装基础软件
1、安装Docker
# step 1: 安装必要的一些系统工具
sudo yum install -y yum-utils device-mapper-persistent-data lvm2
# Step 2: 添加软件源信息
sudo yum-config-manager --add-repo https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo
# PS:如果出现如下错误信息
Loaded plugins: fastestmirror
adding repo from: https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo
grabbing file https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo to /etc/yum.repos.d/docker-ce.repo
Could not fetch/save url https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo to file /etc/yum.repos.d/docker-ce.repo: [Errno 14] curl#60 - "Peer's Certificate issuer is not recognized."
# 编辑 /etc/yum.conf 文件, 在 [main] 下面添加 sslverify=0 参数
vi /etc/yum.conf
# 配置如下----------------------
[main]
sslverify=0
# -----------------------------
# Step 3: 更新并安装Docker-CE
sudo yum makecache fast
sudo yum -y install docker-ce
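# Step 4: 开启Docker服务(建议同时执行 sudo systemctl enable docker 设置开机自启,否则 kubeadm init 预检时会出现 docker service is not enabled 的警告)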
sudo service docker start
2、添加阿里云yum源
cat > /etc/yum.repos.d/kubernetes.repo << EOF
[kubernetes]
name=Kubernetes
baseurl=https://mirrors.aliyun.com/kubernetes/yum/repos/kubernetes-el7-x86_64
enabled=1
gpgcheck=0
repo_gpgcheck=0
gpgkey=https://mirrors.aliyun.com/kubernetes/yum/doc/yum-key.gpg https://mirrors.aliyun.com/kubernetes/yum/doc/rpm-package-key.gpg
EOF
3、安装kubeadm、kubelet、kubectl
yum install -y kubelet-1.23.6 kubeadm-1.23.6 kubectl-1.23.6
systemctl enable kubelet
# 将 Docker 的 cgroup 驱动配置为 systemd(与 kubelet 使用的 cgroup 驱动保持一致),修改 /etc/docker/daemon.json,为以下内容
cat <<EOF> /etc/docker/daemon.json
{
"exec-opts": ["native.cgroupdriver=systemd"],
"registry-mirrors": ["https://kn0t2bca.mirror.aliyuncs.com"]
}
EOF
# 重启 docker
systemctl daemon-reload
systemctl restart docker
4、部署 Kubernetes Master
# 在 Master 节点下执行
kubeadm init \
--apiserver-advertise-address=192.168.129.131 \
--image-repository registry.aliyuncs.com/google_containers \
--kubernetes-version v1.23.6 \
--service-cidr=10.96.0.0/12 \
--pod-network-cidr=10.244.0.0/16
# 安装成功后,复制如下配置并执行
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
kubectl get nodes
成功之后,显示如下:
中途重启centos虚拟机之后,无法获取到IP地址,解决方法参考:
https://blog.csdn.net/what_where/article/details/103370726
5、加入 Kubernetes Node
分别在 k8s-node1 和 k8s-node2 执行
# 下方命令可以在 k8s master 控制台初始化成功后复制 join 命令
kubeadm join 192.168.129.131:6443 --token ssgbgk.sqcyma2trvi0h5d6 --discovery-token-ca-cert-hash sha256:9ddb0fabacb98052ef8dce742ed43108ea9b3b37938b8b5c839bf035427ead8b
# 如果初始化的 token 不小心清空了,可以通过如下命令获取或者重新申请
# 如果 token 已经过期,就重新申请
kubeadm token create
# token 没有过期可以通过如下命令获取
kubeadm token list
# 获取 --discovery-token-ca-cert-hash 值,得到值后需要在前面拼接上 sha256:
openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | openssl rsa -pubin -outform der 2>/dev/null | \
openssl dgst -sha256 -hex | sed 's/^.* //'
# 在master节点确认节点是否加入
kubectl get nodes
NAME STATUS ROLES AGE VERSION
k8s-master NotReady control-plane,master 13h v1.23.6
k8s-node1 NotReady <none> 12h v1.23.6
k8s-node2 NotReady <none> 12h v1.23.6
上面显示节点已经加入集群了,但是还是NotReady状态的,查看pods状态
[root@k8s-master ~]# kubectl get pods -n kube-system
NAME READY STATUS RESTARTS AGE
coredns-6d8c4cb4d-jq8zl 0/1 Pending 0 13h
coredns-6d8c4cb4d-wkxbq 0/1 Pending 0 13h
可以看到coredns-*是Pending状态,这是因为集群还没有安装 CNI 网络插件(节点也因此处于 NotReady 状态),下一步配置网络。
6、部署 CNI 网络插件
cd /opt
mkdir k8s
cd k8s
# 在 master 节点上执行
# 下载 calico 配置文件,可能会网络超时
curl https://calico-v3-25.netlify.app/archive/v3.25/manifests/calico.yaml -O
# 修改 calico.yaml 文件中的 CALICO_IPV4POOL_CIDR 配置(默认是被注释的,需要取消注释),将其值修改为与 kubeadm init 时 --pod-network-cidr 相同的值(本文为 10.244.0.0/16)
vim calico.yaml
# 修改 IP_AUTODETECTION_METHOD 下的网卡名称,(没找到,先不改)
# 查看镜像地址
[root@k8s-master k8s]# grep image calico.yaml
image: docker.io/calico/cni:v3.25.0
imagePullPolicy: IfNotPresent
image: docker.io/calico/cni:v3.25.0
imagePullPolicy: IfNotPresent
image: docker.io/calico/node:v3.25.0
imagePullPolicy: IfNotPresent
image: docker.io/calico/node:v3.25.0
imagePullPolicy: IfNotPresent
image: docker.io/calico/kube-controllers:v3.25.0
imagePullPolicy: IfNotPresent
从输出看,使用docker的默认的镜像地址,速度会很慢,需要修改
# 删除镜像 docker.io/ 前缀,避免下载过慢导致失败
sed -i 's#docker.io/##g' calico.yaml
# 再次查看镜像地址
[root@k8s-master k8s]# grep image calico.yaml
image: calico/cni:v3.25.0
imagePullPolicy: IfNotPresent
image: calico/cni:v3.25.0
imagePullPolicy: IfNotPresent
image: calico/node:v3.25.0
imagePullPolicy: IfNotPresent
image: calico/node:v3.25.0
imagePullPolicy: IfNotPresent
image: calico/kube-controllers:v3.25.0
imagePullPolicy: IfNotPresent
可以看到已经被修改了
# 更新网络配置资源
kubectl apply -f calico.yaml
kubectl:这是 Kubernetes 的命令行工具,用于与 Kubernetes 集群进行交互。它允许你运行命令来管理 Kubernetes 集群中的资源。
apply:apply 子命令用于根据提供的配置文件(在这个例子中是 calico.yaml)来应用或更新 Kubernetes 集群中的资源。这个命令会检查集群中现有的资源状态,并尝试将配置文件中定义的资源状态应用到集群中。如果资源已经存在,apply 命令会尝试更新它以匹配配置文件中的状态。如果资源不存在,则会创建它。
-f:这是一个标志(flag),用于指定后面跟着的是配置文件的路径。在这个例子中,-f 后面跟着的是 calico.yaml,表示 kubectl apply 命令将使用这个 YAML 文件作为输入来应用或更新资源。
calico.yaml:这是一个 YAML 格式的文件,包含了要在 Kubernetes 集群中部署或更新的资源定义。在这个例子中,calico.yaml 是用于部署 Calico CNI(容器网络接口)插件的配置文件。Calico 是一个高性能的容器网络解决方案,为 Kubernetes 集群提供网络策略和网络连接功能。
# 再次查看pod状态
[root@k8s-master k8s]# kubectl get pods -n kube-system
NAME READY STATUS RESTARTS AGE
calico-kube-controllers-cd8566cf-66hdp 0/1 Pending 0 30m
calico-node-5hpns 0/1 Init:ImagePullBackOff 0 30m
calico-node-gzhm7 0/1 Init:ImagePullBackOff 0 30m
calico-node-vbw4g 0/1 Init:ImagePullBackOff 0 30m
coredns-6d8c4cb4d-jq8zl 0/1 Pending 0 14h
coredns-6d8c4cb4d-wkxbq 0/1 Pending 0 14h
etcd-k8s-master 1/1 Running 1 14h
kube-apiserver-k8s-master 1/1 Running 1 14h
kube-controller-manager-k8s-master 1/1 Running 1 14h
kube-proxy-dmdpn 1/1 Running 0 13h
kube-proxy-rcwxh 1/1 Running 0 14h
kube-proxy-wmhnl 1/1 Running 0 13h
kube-scheduler-k8s-master 1/1 Running 1 14h
# 查看calico-kube-controllers-cd8566cf-66hdp信息
[root@k8s-master k8s]# kubectl describe po calico-kube-controllers-cd8566cf-66hdp -n kube-system
Name: calico-kube-controllers-cd8566cf-66hdp
Namespace: kube-system
Priority: 2000000000
Priority Class Name: system-cluster-critical
Node: <none>
Labels: k8s-app=calico-kube-controllers
pod-template-hash=cd8566cf
Annotations: <none>
Status: Pending
IP:
IPs: <none>
Controlled By: ReplicaSet/calico-kube-controllers-cd8566cf
Containers:
calico-kube-controllers:
Image: calico/kube-controllers:v3.25.0
Port: <none>
Host Port: <none>
Liveness: exec [/usr/bin/check-status -l] delay=10s timeout=10s period=10s #success=1 #failure=6
Readiness: exec [/usr/bin/check-status -r] delay=0s timeout=1s period=10s #success=1 #failure=3
Environment:
ENABLED_CONTROLLERS: node
DATASTORE_TYPE: kubernetes
Mounts:
/var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-5sj8x (ro)
Conditions:
Type Status
PodScheduled False
Volumes:
kube-api-access-5sj8x:
Type: Projected (a volume that contains injected data from multiple sources)
TokenExpirationSeconds: 3607
ConfigMapName: kube-root-ca.crt
ConfigMapOptional: <nil>
DownwardAPI: true
QoS Class: BestEffort
Node-Selectors: kubernetes.io/os=linux
Tolerations: CriticalAddonsOnly op=Exists
node-role.kubernetes.io/control-plane:NoSchedule
node-role.kubernetes.io/master:NoSchedule
node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Warning FailedScheduling 49s (x31 over 31m) default-scheduler 0/3 nodes are available: 3 node(s) had taint {node.kubernetes.io/not-ready: }, that the pod didn't tolerate.
最后一行:3 node(s) had taint,三个节点都带有 node.kubernetes.io/not-ready 污点(节点处于 NotReady 状态),而该 pod 没有配置对应的容忍(toleration),所以暂时无法被调度
遇到问题:calico-node-5hpns 0/1 Init:ImagePullBackOff 0 38m
参考下面的文档:
K8s的Pod出现Init:ImagePullBackOff问题的解决,(以calico网络插件为例)-CSDN博客
部署k8s安装Calico插件提示镜像拉取失败 Init:ImagePullBackOff(新增镜像仓库地址解决)_calico镜像拉取失败-CSDN博客
k8s安装网络组件calico报错Init:ImagePullBackOff - lucky_tomato - 博客园
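vim /etc/docker/daemon.json 中添加镜像源(只修改 registry-mirrors 一项,之前配置的 "exec-opts": ["native.cgroupdriver=systemd"] 要保留,否则 Docker 的 cgroup 驱动会恢复为默认值,与 kubelet 不一致)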
{
"registry-mirrors": ["https://docker.m.daocloud.io","https://p5lmkba8.mirror.aliyuncs.com","https://registry.docker-cn.com"]
}
# 重启
sudo systemctl daemon-reload
sudo systemctl restart docker
# 查看pod所在node
kubectl get pods -n kube-system -o wide
# 到报错的机器上面手动下载镜像,可以参考其它成功机器的镜像
docker pull calico/cni:v3.25.0
# 然后删除失败Pod(将下面命令中的 calico-node-bvvhc 替换为实际失败的 Pod 名称;删除后 DaemonSet 会自动重建该 Pod)
kubectl get pods -n kube-system | grep calico-node-bvvhc | awk '{print$1}'| xargs kubectl delete -n kube-system pods
# 再次查看pods
[root@k8s-master k8s]# kubectl get po -n kube-system -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
calico-kube-controllers-cd8566cf-66hdp 1/1 Running 0 120m 10.244.235.198 k8s-master <none> <none>
calico-node-d26xm 1/1 Running 1 (28m ago) 42m 192.168.129.133 k8s-node2 <none> <none>
calico-node-qmpn8 1/1 Running 1 (29m ago) 42m 192.168.129.131 k8s-master <none> <none>
calico-node-xf5h2 1/1 Running 0 42m 192.168.129.132 k8s-node1 <none> <none>
coredns-6d8c4cb4d-jq8zl 1/1 Running 1 (28m ago) 15h 10.244.235.197 k8s-master <none> <none>
coredns-6d8c4cb4d-wkxbq 1/1 Running 1 (28m ago) 15h 10.244.235.196 k8s-master <none> <none>
etcd-k8s-master 1/1 Running 4 (29m ago) 15h 192.168.129.131 k8s-master <none> <none>
kube-apiserver-k8s-master 1/1 Running 4 (28m ago) 15h 192.168.129.131 k8s-master <none> <none>
kube-controller-manager-k8s-master 1/1 Running 4 (29m ago) 15h 192.168.129.131 k8s-master <none> <none>
kube-proxy-dmdpn 1/1 Running 2 (28m ago) 15h 192.168.129.133 k8s-node2 <none> <none>
kube-proxy-rcwxh 1/1 Running 3 (29m ago) 15h 192.168.129.131 k8s-master <none> <none>
kube-proxy-wmhnl 1/1 Running 2 (26m ago) 15h 192.168.129.132 k8s-node1 <none> <none>
kube-scheduler-k8s-master 1/1 Running 4 (29m ago) 15h 192.168.129.131 k8s-master <none> <none>
# 再次查看node
[root@k8s-master k8s]# kubectl get nodes
NAME STATUS ROLES AGE VERSION
k8s-master Ready control-plane,master 15h v1.23.6
k8s-node1 Ready <none> 15h v1.23.6
k8s-node2 Ready <none> 15h v1.23.6
此时都是正常运行的状态了
7、测试 kubernetes 集群
# 创建部署
kubectl create deployment nginx --image=nginx
# 暴露端口
kubectl expose deployment nginx --port=80 --type=NodePort
# 查看 pod 以及服务信息
[root@k8s-master k8s]# kubectl get pod,svc
NAME READY STATUS RESTARTS AGE
pod/nginx-85b98978db-zwnkq 0/1 ContainerCreating 0 12s
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
service/kubernetes ClusterIP 10.96.0.1 <none> 443/TCP 15h
service/nginx NodePort 10.106.36.179 <none> 80:32268/TCP 5s
# 链接测试
[root@k8s-master k8s]# curl 192.168.129.131:32268
<!DOCTYPE html>
<html>
<head>
<title>Welcome to nginx!</title>
<style>
html { color-scheme: light dark; }
body { width: 35em; margin: 0 auto;
font-family: Tahoma, Verdana, Arial, sans-serif; }
</style>
</head>
<body>
<h1>Welcome to nginx!</h1>
<p>If you see this page, the nginx web server is successfully installed and
working. Further configuration is required.</p>
<p>For online documentation and support please refer to
<a href="http://nginx.org/">nginx.org</a>.<br/>
Commercial support is available at
<a href="http://nginx.com/">nginx.com</a>.</p>
<p><em>Thank you for using nginx.</em></p>
</body>
</html>
此时通过浏览器访问192.168.129.131:32268、192.168.129.132:32268、192.168.129.133:32268都可以访问到nginx的默认页面。
常用命令
# 查看kubelet服务状态
systemctl status kubelet
# 查看journal
journalctl -u kubelet --no-pager
journalctl 的 -u 参数可以指定服务进行过滤,这样可以屏蔽掉其他无关日志。 --no-pager 参数可以一次性输出日志,当然如果你只是在线查看,则可以不用这个参数,只是输出日志受到屏幕宽度限制,需要通过方向键滚动。
另外一种写法
journalctl -xefu kubelet
journalctl:这是主命令,用于访问 systemd 的日志。
-x:这个选项表示在输出中包含详细的解释性信息,有助于理解日志条目的含义和上下文。
-e:这个选项表示直接跳转到日志的末尾(在分页器中定位到最新的日志条目),通常用于快速查看最近的日志信息。
-f:这个选项表示实时跟踪日志输出,类似于 tail -f 命令,可以持续显示新的日志条目。
-u kubelet:这个选项指定了要查看的 systemd 单元(unit),在这个例子中是 kubelet。systemd 单元是 systemd 系统和服务管理器的基本构建块,表示系统资源或服务。
# 查看docker的driver
docker info | grep Driver
# 查看token
kubeadm token list
TOKEN TTL EXPIRES USAGES DESCRIPTION EXTRA GROUPS
ssgbgk.sqcyma2trvi0h5d6 23h 2024-12-29T16:25:00Z authentication,signing The default bootstrap token generated by 'kubeadm init'. system:bootstrappers:kubeadm:default-node-token
# 获取cert-hash值
# 获取 --discovery-token-ca-cert-hash 值,得到值后需要在前面拼接上 sha256:
openssl x509 -pubkey -in /etc/kubernetes/pki/ca.crt | openssl rsa -pubin -outform der 2>/dev/null | \
openssl dgst -sha256 -hex | sed 's/^.* //'
# 查看组件状态
kubectl get componentstatus
# 或者简写kubectl get cs
Warning: v1 ComponentStatus is deprecated in v1.19+
NAME STATUS MESSAGE ERROR
scheduler Healthy ok
controller-manager Healthy ok
etcd-0 Healthy {"health":"true","reason":""}
# 查看pod状态
kubectl get pods -n kube-system
NAME READY STATUS RESTARTS AGE
coredns-6d8c4cb4d-jq8zl 0/1 Pending 0 13h
coredns-6d8c4cb4d-wkxbq 0/1 Pending 0 13h
etcd-k8s-master 1/1 Running 1 13h
kube-apiserver-k8s-master 1/1 Running 1 13h
kube-controller-manager-k8s-master 1/1 Running 1 13h
kube-proxy-dmdpn 1/1 Running 0 12h
kube-proxy-rcwxh 1/1 Running 0 13h
kube-proxy-wmhnl 1/1 Running 0 12h
kube-scheduler-k8s-master 1/1 Running 1 13h
报错如下
"Failed to run kubelet" err="failed to run Kubelet: running with swap on is not supported, please disable swap! or set --fail-swap-on flag to false. /proc/swaps contained: [Filename\t\t\t\tType\t\tSize\tUsed\tPriority /dev/sda2
需要重新禁用swap(执行 swapoff -a,并确认 /etc/fstab 中的 swap 行已被注释),然后重启 kubelet 或重启虚拟机
报错如下:
[init] Using Kubernetes version: v1.23.6
[preflight] Running pre-flight checks
[WARNING Service-Docker]: docker service is not enabled, please run 'systemctl enable docker.service'
[WARNING SystemVerification]: this Docker version is not on the list of validated versions: 26.1.4. Latest validated version: 20.10
error execution phase preflight: [preflight] Some fatal errors occurred:
[ERROR Port-6443]: Port 6443 is in use
[ERROR Port-10259]: Port 10259 is in use
[ERROR Port-10257]: Port 10257 is in use
[ERROR FileAvailable--etc-kubernetes-manifests-kube-apiserver.yaml]: /etc/kubernetes/manifests/kube-apiserver.yaml already exists
[ERROR FileAvailable--etc-kubernetes-manifests-kube-controller-manager.yaml]: /etc/kubernetes/manifests/kube-controller-manager.yaml already exists
[ERROR FileAvailable--etc-kubernetes-manifests-kube-scheduler.yaml]: /etc/kubernetes/manifests/kube-scheduler.yaml already exists
[ERROR FileAvailable--etc-kubernetes-manifests-etcd.yaml]: /etc/kubernetes/manifests/etcd.yaml already exists
[ERROR Port-10250]: Port 10250 is in use
[ERROR Port-2379]: Port 2379 is in use
[ERROR Port-2380]: Port 2380 is in use
[ERROR DirAvailable--var-lib-etcd]: /var/lib/etcd is not empty
[preflight] If you know what you are doing, you can make a check non-fatal with `--ignore-preflight-errors=...`
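解决方法:这些报错说明该节点之前已经执行过 kubeadm init(端口被占用、manifests 文件已存在、/var/lib/etcd 非空),需要先重置再重新初始化(注意:reset 会清除该节点上已有的集群配置):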
kubeadm reset
参考:3.1.4_搭建k8s集群-命令行工具:在任意节点使用kubectl_哔哩哔哩_bilibili