1. Deploying the CSI Driver and Preparing Images
1.1 Deployment Command
kubectl apply -f https://github.com/qiniu/kubernetes-csi-driver/releases/download/v0.2.0/kodo-plugin.yaml
1.2 kodo-plugin.yaml (China-mirror image version)
---
apiVersion: storage.k8s.io/v1
kind: CSIDriver
metadata:
  name: kodoplugin.storage.qiniu.com
spec:
  attachRequired: false
  podInfoOnMount: true
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: kodo-csi-plugin
  namespace: test-s3
spec:
  selector:
    matchLabels:
      app: kodo-csi-plugin
  template:
    metadata:
      labels:
        app: kodo-csi-plugin
    spec:
      serviceAccount: sa.kodoplugin.storage.qiniu.com
      tolerations:
        - operator: Exists
      nodeSelector:
        kubernetes.io/os: linux
      priorityClassName: system-node-critical
      hostNetwork: true
      hostPID: true
      hostIPC: true
      containers:
        - name: csi-driver-registrar
          image: k8s-gcr.m.daocloud.io/sig-storage/csi-node-driver-registrar:v2.5.0
          imagePullPolicy: IfNotPresent
          args:
            - "--v=5"
            - "--csi-address=/var/lib/kubelet/csi-plugins/kodoplugin.storage.qiniu.com/csi.sock"
            - "--kubelet-registration-path=/var/lib/kubelet/csi-plugins/kodoplugin.storage.qiniu.com/csi.sock"
            - "--plugin-registration-path=/registration"
          volumeMounts:
            - name: kubelet-dir
              mountPath: /var/lib/kubelet/
            - name: registration-dir
              mountPath: /registration
          livenessProbe:
            exec:
              command:
                - /csi-node-driver-registrar
                - --plugin-registration-path=/registration
                - --kubelet-registration-path=/var/lib/kubelet/csi-plugins/kodoplugin.storage.qiniu.com/csi.sock
                - --mode=kubelet-registration-probe
            initialDelaySeconds: 30
            timeoutSeconds: 15
        - name: kodo-plugin
          securityContext:
            privileged: true
            capabilities:
              add: ["SYS_ADMIN"]
            allowPrivilegeEscalation: true
          image: kodoproduct/csi-plugin.storage.qiniu.com:v0.2.0
          imagePullPolicy: IfNotPresent
          command: ["/bin/bash"]
          args:
            - "-c"
            - "/entrypoint.sh --endpoint=$(CSI_ENDPOINT) --v=2 --nodeid=$(KUBE_NODE_NAME) --driver=kodo --health-port=11261"
          env:
            - name: KUBE_NODE_NAME
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: spec.nodeName
            - name: CSI_ENDPOINT
              value: unix:///var/lib/kubelet/csi-plugins/kodoplugin.storage.qiniu.com/csi.sock
          livenessProbe:
            httpGet:
              path: /health
              port: health
              scheme: HTTP
            initialDelaySeconds: 60
            periodSeconds: 60
            timeoutSeconds: 35
            failureThreshold: 35
          ports:
            - name: health
              containerPort: 11261
              protocol: TCP
          volumeMounts:
            - name: kubelet-dir
              mountPath: /var/lib/kubelet/
              mountPropagation: Bidirectional
            - name: host-log
              mountPath: /var/log/qiniu/
            - name: bin-dir
              mountPath: /host/usr/local/bin/
            - name: systemd-dir
              mountPath: /host/etc/systemd/system/
            - name: socket-dir
              mountPath: /var/lib/qiniu/
              mountPropagation: Bidirectional
            - name: host-mount
              mountPath: /mnt
              mountPropagation: Bidirectional
            - name: host-dev
              mountPath: /dev
            - name: host-sys
              mountPath: /sys
            - name: host-proc
              mountPath: /host/proc
      volumes:
        - name: registration-dir
          hostPath:
            path: /var/lib/kubelet/plugins_registry
            type: DirectoryOrCreate
        - name: socket-dir
          hostPath:
            path: /var/lib/qiniu/
            type: DirectoryOrCreate
        - name: kubelet-dir
          hostPath:
            path: /var/lib/kubelet
            type: Directory
        - name: host-log
          hostPath:
            path: /var/log/qiniu/
            type: DirectoryOrCreate
        - name: bin-dir
          hostPath:
            path: /usr/local/bin/
            type: DirectoryOrCreate
        - name: systemd-dir
          hostPath:
            path: /etc/systemd/system/
            type: DirectoryOrCreate
        - name: host-mount
          hostPath:
            path: /mnt
            type: Directory
        - name: host-dev
          hostPath:
            path: /dev
            type: Directory
        - name: host-sys
          hostPath:
            path: /sys
            type: Directory
        - name: host-proc
          hostPath:
            path: /proc
            type: Directory
  updateStrategy:
    rollingUpdate:
      maxUnavailable: 10%
    type: RollingUpdate
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: sa.kodoplugin.storage.qiniu.com
  namespace: test-s3
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: role.kodoplugin.storage.qiniu.com
rules:
  - apiGroups: [""]
    resources: ["persistentvolumes", "endpoints", "configmaps"]
    verbs: ["get", "list", "watch", "create", "delete", "update"]
  - apiGroups: [""]
    resources: ["persistentvolumeclaims", "nodes"]
    verbs: ["get", "list", "watch", "update"]
  - apiGroups: [""]
    resources: ["events"]
    verbs: ["get", "list", "watch", "create", "update", "patch"]
  - apiGroups: [""]
    resources: ["secrets", "namespaces"]
    verbs: ["get", "list"]
  - apiGroups: [""]
    resources: ["nodes", "pods"]
    verbs: ["get", "list", "watch", "update"]
  - apiGroups: ["storage.k8s.io"]
    resources: ["volumeattachments", "volumeattachments/status"]
    verbs: ["get", "list", "watch", "update", "patch"]
  - apiGroups: ["storage.k8s.io"]
    resources: ["storageclasses"]
    verbs: ["get", "list", "watch"]
  - apiGroups: ["coordination.k8s.io"]
    resources: ["leases"]
    verbs: ["get", "list", "watch", "create", "update", "patch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: binding.kodoplugin.storage.qiniu.com
subjects:
  - kind: ServiceAccount
    name: sa.kodoplugin.storage.qiniu.com
    namespace: test-s3
roleRef:
  kind: ClusterRole
  name: role.kodoplugin.storage.qiniu.com
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kodo-provisioner
  namespace: test-s3
spec:
  selector:
    matchLabels:
      app: kodo-provisioner
  replicas: 2
  template:
    metadata:
      labels:
        app: kodo-provisioner
    spec:
      serviceAccount: sa.kodoplugin.storage.qiniu.com
      tolerations:
        - operator: Exists
      nodeSelector:
        kubernetes.io/os: linux
      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 1
              preference:
                matchExpressions:
                  - key: node-role.kubernetes.io/master
                    operator: Exists
      priorityClassName: system-node-critical
      hostNetwork: true
      containers:
        - name: external-kodo-provisioner
          securityContext:
            privileged: true
          image: gcr.m.daocloud.io/k8s-staging-sig-storage/csi-provisioner:canary
          args:
            - "--csi-address=$(ADDRESS)"
            - "--volume-name-prefix=kodo"
            - "--timeout=150s"
            - "--leader-election=true"
            - "--retry-interval-start=500ms"
            - "--v=5"
          env:
            - name: ADDRESS
              value: /var/lib/kubelet/csi-plugins/kodoplugin.storage.qiniu.com/csi.sock
          imagePullPolicy: IfNotPresent
          volumeMounts:
            - name: kubelet-dir
              mountPath: /var/lib/kubelet/
              mountPropagation: Bidirectional
      volumes:
        - name: kubelet-dir
          hostPath:
            path: /var/lib/kubelet
            type: Directory
2. Building the Image from Source
2.1 Directory Layout
Image source directories:
- 10.10.207.16: /data/luozhihong/s3/CSI/kubernetes-csi-driver-0.2.0
- 10.10.207.18: /root/luozhihong/kubernetes-csi-driver-0.2.0
Deployment directory:
/data/luozhihong/s3
Note: the official image does not match the cluster's system architecture, so the source must be pulled and the image built manually.
2.2 Build Steps
Step 1: Fetch the source
# Clone the repository (the v0.2.0 tag is required)
git clone https://github.com/qiniu/kubernetes-csi-driver.git
cd kubernetes-csi-driver
git checkout v0.2.0
Step 2: Prepare the system binaries
# Create the staging directory
mkdir -p docker/amd64
# Check that rclone works
rclone version
# Copy the system binaries into the build context
cp /usr/local/bin/rclone docker/amd64/
cp /usr/local/bin/kodofs docker/amd64/
Step 3: If rclone is broken, download and replace it
# Download rclone
wget https://downloads.rclone.org/v1.64.0/rclone-v1.64.0-linux-amd64.zip
unzip rclone-v1.64.0-linux-amd64.zip
# Replace the rclone binary
cp rclone-v1.64.0-linux-amd64/rclone /usr/local/bin/
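Before copying the binaries into docker/amd64/, it may be worth confirming they actually match the target architecture (a quick sanity check; assumes the file utility is installed on the build host):
# Both should report an x86-64 ELF executable for an amd64 image
file /usr/local/bin/rclone
file /usr/local/bin/kodofs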
2.3 Dockerfile (China-mirror version)
# 1. Build stage
FROM docker.m.daocloud.io/library/golang:1.21-alpine3.18 AS build-env
ARG TARGETOS
ARG TARGETARCH
COPY . /app
WORKDIR /app
# Install build dependencies
RUN apk add --no-cache git make
# Use a China-mainland Go module proxy and build
RUN export GOPROXY=https://goproxy.cn,direct && \
    GOOS=$TARGETOS GOARCH=$TARGETARCH make build
# 2. Final image
FROM docker.m.daocloud.io/library/alpine:3.18
ARG TARGETOS
ARG TARGETARCH
ARG PLUGIN_FILENAME=plugin.storage.qiniu.com
ARG CONNECTOR_FILENAME=connector.${PLUGIN_FILENAME}
# Copy the build artifacts
COPY --from=build-env /app/plugin/${PLUGIN_FILENAME} /usr/local/bin/${PLUGIN_FILENAME}
COPY --from=build-env /app/connector/${CONNECTOR_FILENAME} /usr/local/bin/${CONNECTOR_FILENAME}
# Copy the extra files
COPY docker/${TARGETARCH}/kodofs /usr/local/bin/kodofs
COPY docker/${TARGETARCH}/rclone /usr/local/bin/rclone
COPY docker/kodo-csi-connector.service /csiplugin-connector.service
COPY docker/entrypoint.sh /entrypoint.sh
# Make the binaries and entrypoint executable
RUN chmod +x /usr/local/bin/kodofs \
    /usr/local/bin/rclone \
    /usr/local/bin/${PLUGIN_FILENAME} \
    /usr/local/bin/${CONNECTOR_FILENAME} \
    /entrypoint.sh
# Install runtime dependencies
RUN apk add --no-cache util-linux ca-certificates bash
ENTRYPOINT ["/entrypoint.sh"]
2.4 Building and Importing the Image
# Build the image
docker build -t kodoproduct/csi-plugin.storage.qiniu.com:v0.2.0 .
# Export the image
docker save -o csi.tar kodoproduct/csi-plugin.storage.qiniu.com:v0.2.0
# Import it into containerd
ctr -n k8s.io images import csi.tar
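To verify the import succeeded, the image can be looked up in containerd's k8s.io namespace (a minimal check; crictl works too, if installed):
# The image should appear in the list
ctr -n k8s.io images ls | grep csi-plugin.storage.qiniu.com
crictl images | grep csi-plugin.storage.qiniu.com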
2.5 Deploy the CSI Driver
kubectl apply -f kodo-plugin.yaml
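After applying the manifest, the rollout can be confirmed with a few standard checks (a minimal sketch; resource names follow the kodo-plugin.yaml above):
# DaemonSet and provisioner status
kubectl -n test-s3 get daemonset kodo-csi-plugin
kubectl -n test-s3 get deployment kodo-provisioner
# All pods should reach Running
kubectl -n test-s3 get pods -o wide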
3. Using a Specified Bucket (Static Provisioning)
3.1 Create the Secret
s3-secret.yaml:
apiVersion: v1
kind: Secret
metadata:
  name: kodo-csi-pv-secret
  namespace: test-s3
type: Opaque
stringData:
  accesskey: "AK"
  secretkey: "SK"
  bucketname: "ruantong-test"
  ucendpoint: "http://uc.qiniuapi.com"
  storageclass: "STANDARD"
  region: "z2"
  subdir: ""
  s3forcepathstyle: "false"
kubectl apply -f s3-secret.yaml
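To confirm the Secret was created without printing the key material (describe lists key names and sizes, not values):
kubectl -n test-s3 describe secret kodo-csi-pv-secret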
3.2 Create the PV and PVC
s3-pv-pvc.yaml:
---
# PV - PersistentVolume
apiVersion: v1
kind: PersistentVolume
metadata:
  name: kodo-csi-pv
  labels:
    kodo-pvname: kodo-csi-pv
spec:
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  csi:
    driver: kodoplugin.storage.qiniu.com
    volumeHandle: kodo-csi-pv
    volumeAttributes:
      uploadcutoff: "209715200"
      uploadchunksize: "5242880"
      uploadconcurrency: "4"
      vfscachemode: "off"
      dircacheduration: "5m0s"
      buffersize: "16777216"
      vfsreadahead: "0"
      vfscachemaxage: "1h0m0s"
      vfscachepollinterval: "1m0s"
      vfswriteback: "5s"
      vfsreadchunksize: "134217728"
      nochecksum: "no"
      nomodtime: "no"
      noseek: "no"
      readonly: "no"
      transfers: "4"
    nodePublishSecretRef:
      name: kodo-csi-pv-secret
      namespace: test-s3
---
# PVC - PersistentVolumeClaim
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: kodo-pvc
  namespace: test-s3
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: ''
  resources:
    requests:
      storage: 10Gi
  selector:
    matchLabels:
      kodo-pvname: kodo-csi-pv
kubectl apply -f s3-pv-pvc.yaml
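The PVC should bind to the pre-created PV almost immediately; both can be checked as follows (expected STATUS: Bound):
kubectl get pv kodo-csi-pv
kubectl -n test-s3 get pvc kodo-pvc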
4. Dynamically Creating Buckets
4.1 Create the Secret
s3-secret.yaml:
apiVersion: v1
kind: Secret
metadata:
  name: kodo-csi-sc-secret
  namespace: test-s3
type: Opaque
stringData:
  accesskey: "AK"
  secretkey: "SK"
  bucketname: "ruantong-test"
  ucendpoint: "http://uc.qiniuapi.com"
  storageclass: "STANDARD"
  region: "z2"
  subdir: ""
  s3forcepathstyle: "false"
kubectl apply -f s3-secret.yaml
4.2 Create the StorageClass
sc.yaml:
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: kodo-csi-sc
parameters:
  uploadcutoff: "209715200"
  uploadchunksize: "5242880"
  uploadconcurrency: "4"
  vfscachemode: "off"
  dircacheduration: "5m0s"
  buffersize: "16777216"
  vfsreadahead: "0"
  vfscachemaxage: "1h0m0s"
  vfscachepollinterval: "1m0s"
  vfswriteback: "5s"
  vfsreadchunksize: "134217728"
  nochecksum: "no"
  nomodtime: "no"
  noseek: "no"
  readonly: "no"
  transfers: "4"
  csi.storage.k8s.io/provisioner-secret-name: kodo-csi-sc-secret
  csi.storage.k8s.io/provisioner-secret-namespace: test-s3
provisioner: kodoplugin.storage.qiniu.com
reclaimPolicy: Retain # Retain: keep the bucket and its data | Delete: delete the bucket and its data
kubectl apply -f sc.yaml
4.3 Create the PVC
pvc.yaml:
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: kodo-pvc
  namespace: test-s3
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: kodo-csi-sc
  resources:
    requests:
      storage: 10Gi
kubectl apply -f pvc.yaml
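With dynamic provisioning, the external provisioner creates the PV (and the bucket) on demand; binding can be watched like this (press Ctrl-C once STATUS shows Bound):
kubectl -n test-s3 get pvc kodo-pvc -w
# The dynamically created PV carries the kodo- name prefix set in the provisioner args
kubectl get pv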
5. Deploying the Model
5.1 Deploy the BGE-M3 Model
bge-m3-s3.yaml:
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-bge-m3
  namespace: test-s3
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-bge-m3
  template:
    metadata:
      labels:
        app: vllm-bge-m3
    spec:
      nodeSelector:
        kubernetes.io/hostname: bcm-headnode
      containers:
        - name: vllm-bge-m3
          image: bge-m3:v0.1.1
          command:
            - vllm
            - serve
            - /kodo
            - --served-model-name=bge-m3
            - --host=0.0.0.0
            - --port=5000
            - --tensor-parallel-size=1
            - --max-model-len=4096
            - --gpu-memory-utilization=0.3
          env:
            - name: CUDA_VISIBLE_DEVICES
              value: "0"
          ports:
            - containerPort: 5000
              name: http
          volumeMounts:
            - name: kodo-storage
              mountPath: /kodo
              readOnly: true
            - name: shm
              mountPath: /dev/shm
          resources:
            limits:
              memory: "8Gi"
              cpu: "4"
            requests:
              memory: "8Gi"
              cpu: "2"
      volumes:
        - name: kodo-storage
          persistentVolumeClaim:
            claimName: kodo-pvc
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 4Gi
---
apiVersion: v1
kind: Service
metadata:
  name: vllm-service
  namespace: test-s3
spec:
  type: NodePort
  selector:
    app: vllm-bge-m3
  ports:
    - name: http
      protocol: TCP
      port: 5000
      targetPort: 5000
      nodePort: 30501
kubectl apply -f bge-m3-s3.yaml
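Once the pod is Running, the Kodo mount can be sanity-checked from inside the container (a minimal sketch; the model files in the bucket should be visible under /kodo):
kubectl -n test-s3 get pods -l app=vllm-bge-m3
kubectl -n test-s3 exec deploy/vllm-bge-m3 -- ls /kodo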
5.2 Testing the Model
Run the request from any cluster node; 30501 is the NodePort exposed by vllm-service:
curl http://localhost:30501/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{
    "model": "bge-m3",
    "input": "你好,vLLM 向量服务测试"
  }'
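If jq is available, the embedding dimension can be pulled out of the response directly (a small convenience, assuming the service returns the OpenAI-style embeddings payload):
curl -s http://localhost:30501/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{"model": "bge-m3", "input": "test"}' | jq '.data[0].embedding | length'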
6. Troubleshooting
6.1 Common Problems When Deploying the CSI Driver
⚠️ Problem 1: images cannot be pulled
Solution: switch to image mirrors reachable from mainland China.
⚠️ Problem 2: the kodoproduct/csi-plugin.storage.qiniu.com:v0.2.0 image architecture is incompatible
Cause: the Docker Hub image does not match the node's system architecture (an inspection sketch follows this list).
Solution:
- Pull the source from the code repository and build the image manually.
- Change the FROM lines in the Dockerfile to China-mainland mirror sources.
- Check whether rclone version runs correctly; if not, download and replace rclone:
  wget https://downloads.rclone.org/v1.64.0/rclone-v1.64.0-linux-amd64.zip
  unzip rclone-v1.64.0-linux-amd64.zip
  cp rclone-v1.64.0-linux-amd64/rclone /usr/local/bin/
- When building the image:
  - Create the directory docker/amd64.
  - Copy the host's /usr/local/bin/rclone and /usr/local/bin/kodofs into docker/amd64/.
- After docker build, import the image into containerd (ctr) or push it to Harbor:
  docker save -o csi.tar kodoproduct/csi-plugin.storage.qiniu.com:v0.2.0
  ctr -n k8s.io images import csi.tar
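A quick way to check whether the image and node architectures match (a minimal sketch using standard docker and uname output):
# OS/architecture baked into the image
docker image inspect --format '{{.Os}}/{{.Architecture}}' kodoproduct/csi-plugin.storage.qiniu.com:v0.2.0
# Architecture of the node (x86_64 corresponds to amd64)
uname -m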
6.2 Common Problems When Deploying BGE-M3
⚠️ Problem 3: pinning a GPU in a shared-GPU environment
Solution: add a GPU environment variable to the container:
env:
  - name: CUDA_VISIBLE_DEVICES
    value: "0"
⚠️ Problem 4: image preparation
Make sure the bge-m3 image has been imported into containerd (ctr) or pushed to Harbor, as sketched below.
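A minimal sketch of moving the bge-m3 image into containerd, mirroring the CSI image flow above (the tarball name is arbitrary):
docker save -o bge-m3.tar bge-m3:v0.1.1
ctr -n k8s.io images import bge-m3.tar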
Appendix
Parameter Reference
| Parameter | Description | Default |
|---|---|---|
| uploadcutoff | Upload cutoff size (bytes) | 209715200 |
| uploadchunksize | Upload chunk size (bytes) | 5242880 |
| uploadconcurrency | Upload concurrency | 4 |
| vfscachemode | VFS cache mode | off |
| dircacheduration | Directory cache duration | 5m0s |
| buffersize | Buffer size (bytes) | 16777216 |
| transfers | Number of concurrent transfers | 4 |
Common Commands
# Check CSI plugin status
kubectl get pods -n test-s3
# Check PV/PVC status
kubectl get pv,pvc -n test-s3
# List StorageClasses
kubectl get sc
# View logs
kubectl logs -n test-s3 <pod-name>
# Delete resources
kubectl delete -f <yaml-file>
Document version: v1.0
Last updated: 2024