Troubleshooting Errors Encountered During HAMi Installation and Deployment

1. pod/hami-device-plugin stuck in CrashLoopBackOff

The detailed error messages are as follows:

root@controller01:~# kubectl -n kube-system get pods
NAME READY STATUS RESTARTS AGE
calico-kube-controllers-754966f84c-7nm8r 1/1 Running 3 (17h ago) 3d19h
calico-node-84csz 1/1 Running 3 (17h ago) 3d19h
coredns-596755dbff-94m7w 1/1 Running 4 (17h ago) 3d19h
dashboard-metrics-scraper-799d786dbf-swl8b 1/1 Running 4 (17h ago) 3d19h
hami-device-plugin-n9s94 1/2 CrashLoopBackOff 4 (88s ago) 2m57s
hami-scheduler-cdcc75c6f-s2hp5 2/2 Running 0 2m57s
kubernetes-dashboard-9f8c8b989-f4cqj 1/1 Running 4 (17h ago) 3d19h
metrics-server-5d648558d9-49q98 1/1 Running 5 (17h ago) 3d19h
nfs-client-provisioner-6994b84bd-2w7vr 1/1 Running 4 (17h ago) 3d19h
node-local-dns-2mqv2 1/1 Running 5 (17h ago) 3d19h

# Describe pod/hami-device-plugin-n9s94 to check its events
root@controller01:~# kubectl -n kube-system describe pod hami-device-plugin-n9s94
...
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Scheduled 60s default-scheduler Successfully assigned kube-system/hami-device-plugin-n9s94 to 172.20.0.21
Normal Pulled 58s kubelet Container image "projecthami/hami:latest" already present on machine
Normal Created 58s kubelet Created container vgpu-monitor
Normal Started 58s kubelet Started container vgpu-monitor
Warning FailedPostStartHook 42s (x3 over 59s) kubelet Exec lifecycle hook ([/bin/sh -c cp -f /k8s-vgpu/lib/nvidia/* /usr/local/vgpu/]) for Container "device-plugin" in Pod "hami-device-plugin-n9s94_kube-system(70f823fa-d042-400d-abd1-0527921e60a6)" failed - error: command '/bin/sh -c cp -f /k8s-vgpu/lib/nvidia/* /usr/local/vgpu/' exited with 126: , message: "OCI runtime exec failed: exec failed: cannot exec in a stopped container: unknown\r\n"
Normal Killing 42s (x3 over 59s) kubelet FailedPostStartHook
Warning BackOff 26s (x4 over 56s) kubelet Back-off restarting failed container
Normal Pulled 14s (x4 over 59s) kubelet Container image "projecthami/hami:latest" already present on machine
Normal Created 14s (x4 over 59s) kubelet Created container device-plugin
Normal Started 13s (x4 over 59s) kubelet Started container device-plugin
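
Before digging into the host directory, it can help to confirm why the device-plugin container is already stopped when the postStart hook runs. The commands below are only a sketch; the pod name is taken from the output above and the container name device-plugin from the events:

# Show the exit code/reason of the last device-plugin container instance
kubectl -n kube-system get pod hami-device-plugin-n9s94 -o jsonpath='{.status.containerStatuses[?(@.name=="device-plugin")].lastState.terminated}'
# Look at the previous (crashed) instance's own log
kubectl -n kube-system logs hami-device-plugin-n9s94 -c device-plugin --previous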

Analysis and root-cause investigation:

root@controller01:~# ll /usr/local/vgpu/
total 636
drwxr-xr-x 3 root root 4096 Aug 28 15:37 ./
drwxr-xr-x 17 root root 4096 Nov 15 16:56 ../
drwxr-xr-x 5 root root 4096 Jan 8 15:27 containers/
-rw-r--r-- 1 root root 26 Jan 8 17:20 ld.so.preload
-rwxr-xr-x 1 root root 631456 Jan 8 17:20 libvgpu.so*
root@controller01:~#
root@controller01:~# cd /usr/local/vgpu/
root@controller01:/usr/local/vgpu# ll
total 636
drwxr-xr-x 3 root root 4096 Aug 28 15:37 ./
drwxr-xr-x 17 root root 4096 Nov 15 16:56 ../
drwxr-xr-x 5 root root 4096 Jan 8 15:27 containers/
-rw-r--r-- 1 root root 26 Jan 8 17:20 ld.so.preload
-rwxr-xr-x 1 root root 631456 Jan 8 17:20 libvgpu.so*
root@controller01:/usr/local/vgpu# cd containers/
root@controller01:/usr/local/vgpu/containers# ll
total 20
drwxr-xr-x 5 root root 4096 Jan 8 15:27 ./
drwxr-xr-x 3 root root 4096 Aug 28 15:37 ../
root@controller01:~# kubectl -n kube-system logs hami-device-plugin-n9s94 -c device-plugin
I0114 02:25:43.181802 36787 client.go:53] BuildConfigFromFlags failed for file /root/.kube/config: stat /root/.kube/config: no such file or directory using inClusterConfig
Incorrect Usage: invalid value "false" for flag -v: parse error

NAME:
NVIDIA Device Plugin - NVIDIA device plugin for Kubernetes

USAGE:
NVIDIA Device Plugin [global options] command [command options]

COMMANDS:
version Show the version of NVIDIA Device Plugin
help, h Shows a list of commands or help for one command

GLOBAL OPTIONS:
--mig-strategy value the desired strategy for exposing MIG devices on GPUs that support it:
[none | single | mixed] (default: "none") [$MIG_STRATEGY]
--fail-on-init-error fail the plugin if an error is encountered during initialization, otherwise block indefinitely (default: true) [$FAIL_ON_INIT_ERROR]
--nvidia-driver-root value the root path for the NVIDIA driver installation (typical values are '/' or '/run/nvidia/driver') (default: "/") [$NVIDIA_DRIVER_ROOT]
--pass-device-specs pass the list of DeviceSpecs to the kubelet on Allocate() (default: false) [$PASS_DEVICE_SPECS]
--device-list-strategy value [ --device-list-strategy value ] the desired strategy for passing the device list to the underlying runtime:
[envvar | volume-mounts | cdi-annotations] (default: "envvar") [$DEVICE_LIST_STRATEGY]
--device-id-strategy value the desired strategy for passing device IDs to the underlying runtime:
[uuid | index] (default: "uuid") [$DEVICE_ID_STRATEGY]
--gds-enabled ensure that containers are started with NVIDIA_GDS=enabled (default: false) [$GDS_ENABLED]
--mofed-enabled ensure that containers are started with NVIDIA_MOFED=enabled (default: false) [$MOFED_ENABLED]
--config-file value the path to a config file as an alternative to command line options or environment variables [$CONFIG_FILE]
--cdi-annotation-prefix value the prefix to use for CDI container annotation keys (default: "cdi.k8s.io/") [$CDI_ANNOTATION_PREFIX]
--nvidia-ctk-path value the path to use for the nvidia-ctk in the generated CDI specification (default: "/usr/bin/nvidia-ctk") [$NVIDIA_CTK_PATH]
--container-driver-root value the path where the NVIDIA driver root is mounted in the container; used for generating CDI specifications (default: "/driver-root") [$CONTAINER_DRIVER_ROOT]
-v value number for the log level verbosity (default: 0)
--node-name value node name (default: "172.20.0.21") [$NodeName]
--device-split-count value the number for NVIDIA device split (default: 2) [$DEVICE_SPLIT_COUNT]
--device-memory-scaling value the ratio for NVIDIA device memory scaling (default: 1) [$DEVICE_MEMORY_SCALING]
--device-cores-scaling value the ratio for NVIDIA device cores scaling (default: 1) [$DEVICE_CORES_SCALING]
--disable-core-limit If set, the core utilization limit will be ignored (default: false) [$DISABLE_CORE_LIMIT]
--resource-name value the name of field for number GPU visible in container (default: "nvidia.com/gpu")
--help, -h show help
E0114 02:25:43.184839 36787 main.go:153] invalid value "false" for flag -v: parse error
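
The key line here is "invalid value \"false\" for flag -v: parse error": the -v (log verbosity) flag expects a number but is receiving the string "false". To see exactly which arguments the container is started with, the DaemonSet spec can be dumped. This is only a sketch and assumes the DaemonSet created by the chart is named hami-device-plugin (the name may differ between releases):

# Print the args of the device-plugin container as defined in the DaemonSet
kubectl -n kube-system get ds hami-device-plugin -o jsonpath='{.spec.template.spec.containers[?(@.name=="device-plugin")].args}'
# Or inspect the full container definition
kubectl -n kube-system get ds hami-device-plugin -o yaml | grep -A 20 'name: device-plugin'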
root@controller01:~# kubectl -n kube-system logs hami-device-plugin-n9s94 -c vgpu-monitor | more
I0114 02:04:27.527629 34952 client.go:53] BuildConfigFromFlags failed for file /root/.kube/config: stat /root/.kube/config: no such file or directory using inClusterConfig
W0114 02:04:27.528191 34952 client_config.go:618] Neither --kubeconfig nor --master was specified. Using the inClusterConfig. This might not work.
I0114 02:04:27.528497 34952 metrics.go:353] Initializing metrics for vGPUmonitor
E0114 02:04:32.537271 34952 feedback.go:269] Failed to update container list: env NODE_NAME not set
E0114 02:04:37.541471 34952 feedback.go:269] Failed to update container list: env NODE_NAME not set
E0114 02:04:42.541695 34952 feedback.go:269] Failed to update container list: env NODE_NAME not set
# The message "E0114 02:04:42.541695 34952 feedback.go:269] Failed to update container list: env NODE_NAME not set" keeps repeating
...
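
These errors suggest the NODE_NAME environment variable is not being injected into the vgpu-monitor container. A quick check, as a sketch, assuming the DaemonSet is named hami-device-plugin and that NODE_NAME is normally populated from the downward API field spec.nodeName:

# List the env entries of the vgpu-monitor container
kubectl -n kube-system get ds hami-device-plugin -o jsonpath='{.spec.template.spec.containers[?(@.name=="vgpu-monitor")].env}'
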
root@controller01:~# helm -n kube-system uninstall hami
root@controller01:~# rm -rf /usr/local/vgpu
# Reinstall HAMi
root@controller01:~# helm install hami hami-charts/hami --set resourceName=nvidia.com/vgpu -n kube-system --version 2.4.0
# The same error still occurs

According to the project maintainers' reply, the image may have been corrupted (polluted).
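
One way to check for a corrupted local image is to re-pull it, compare digests, and verify that the files the postStart hook expects actually exist inside the image. This is only a sketch, assuming Docker is available on the node and the projecthami/hami:latest tag shown in the events above:

# Compare the locally cached digest with the one in the registry
docker images --digests | grep projecthami/hami
docker pull projecthami/hami:latest
# Verify the files the postStart hook copies are present in the image
docker run --rm --entrypoint ls projecthami/hami:latest /k8s-vgpu/lib/nvidia/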

Resolution:

Issue filed: https://github.com/Project-HAMi/HAMi/issues/805

After upgrading to HAMi 2.4.1, the problem no longer occurs.
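
If the release was installed with Helm as shown earlier, the upgrade can be done in place. A sketch reusing the same values as the original install command:

# Refresh the chart repository and upgrade the existing release
helm repo update
helm -n kube-system upgrade hami hami-charts/hami --set resourceName=nvidia.com/vgpu --version 2.4.1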

