Analysis of the HAMi Official Monitoring Documentation

Background and Environment

# OS: Ubuntu 20.04.3 LTS x86_64
# HAMi version: 2.4.1
# Kubernetes version: 1.23.17
root@controller01:~# kubectl get nodes
NAME          STATUS   ROLES    AGE    VERSION
172.20.0.21   Ready    master   7d3h   v1.23.17

# Related workload
root@controller01:/opt/installPkgs/k8s-vgpu-basedon-HAMi# cat gpu-test5.yaml
apiVersion: v1
kind: Pod
metadata:
  name: gpu-test5-01
spec:
  restartPolicy: OnFailure
  # nodeName: controller01
  containers:
    - name: gpu-test111
      image: 175.6.40.93:8196/k8s-kubekey/ubuntu2004:pytorch2.2.2-classification-example
      command:
        - python3
        - /opt/classification/train.py
      resources:
        limits:
          nvidia.com/vgpu: 2        # requesting 2 vGPUs
          nvidia.com/gpumem: 3000   # each vGPU gets 3000M device memory (optional, integer)
          # nvidia.com/gpumem-percentage: 10  # each vGPU gets 10% of that GPU's device memory; cannot be used together with nvidia.com/gpumem

root@controller01:/opt/installPkgs/k8s-vgpu-basedon-HAMi# kubectl apply -f gpu-test5.yaml
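Once the Pod is scheduled and running, a quick way to confirm the vGPU allocation is to run nvidia-smi inside the container; a minimal check (Pod and container names taken from the YAML above, exact output depends on the driver):

root@controller01:~# kubectl exec -it gpu-test5-01 -c gpu-test111 -- nvidia-smi
# Expect two GPU entries, each with a 3000MiB memory limit enforced by HAMi-core.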

1. Analysis of the Services under ns/kube-system

root@controller01:/opt/installPkgs/k8s-vgpu-basedon-HAMi/deploy-prometheus# kubectl -n kube-system get svc
NAME                         TYPE        CLUSTER-IP      EXTERNAL-IP   PORT(S)                         AGE
dashboard-metrics-scraper    ClusterIP   10.68.5.223     <none>        8000/TCP                        4d2h
hami-device-plugin-monitor   NodePort    10.68.183.209   <none>        31992:31992/TCP                 6h35m
hami-scheduler               NodePort    10.68.83.62     <none>        443:31998/TCP,31993:31993/TCP   6h35m
kube-dns                     ClusterIP   10.68.0.2       <none>        53/UDP,53/TCP,9153/TCP          4d2h
kube-dns-upstream            ClusterIP   10.68.48.155    <none>        53/UDP,53/TCP                   4d2h
kubelet                      ClusterIP   None            <none>        10250/TCP,10255/TCP,4194/TCP    4d2h
kubernetes-dashboard         NodePort    10.68.176.76    <none>        443:30119/TCP                   4d2h
metrics-server               ClusterIP   10.68.24.142    <none>        443/TCP                         4d2h
node-local-dns               ClusterIP   None            <none>        9253/TCP                        4d2h
# Prepare a sample YAML that requests vGPUs
root@controller01:/opt/installPkgs/k8s-vgpu-basedon-HAMi# cat gpu-test5.yaml
apiVersion: v1
kind: Pod
metadata:
  name: gpu-test5-01
spec:
  restartPolicy: OnFailure
  # nodeName: controller01
  containers:
    - name: gpu-test111
      image: 175.6.40.93:8196/k8s-kubekey/ubuntu2004:pytorch2.2.2-classification-example
      command:
        - python3
        - /opt/classification/train.py
      resources:
        limits:
          nvidia.com/vgpu: 2        # requesting 2 vGPUs
          nvidia.com/gpumem: 3000   # each vGPU gets 3000M device memory (optional, integer)
root@controller01:/opt/installPkgs/k8s-vgpu-basedon-HAMi# kubectl apply -f gpu-test5.yaml
pod/gpu-test5-01 created
root@controller01:/opt/installPkgs/k8s-vgpu-basedon-HAMi# kubectl get pods
NAME           READY   STATUS    RESTARTS   AGE
gpu-test5-01   1/1     Running   0          4s

1.1 svc/hami-device-plugin-monitor

1.1.1 The Service itself

root@controller01:~# kubectl -n kube-system get svc hami-device-plugin-monitor -o yaml
apiVersion: v1
kind: Service
metadata:
  annotations:
    meta.helm.sh/release-name: hami
    meta.helm.sh/release-namespace: kube-system
  creationTimestamp: "2025-01-14T02:34:24Z"
  labels:
    app.kubernetes.io/component: hami-device-plugin
    app.kubernetes.io/instance: hami
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/name: hami
    app.kubernetes.io/version: 2.4.1
    helm.sh/chart: hami-2.4.1
  name: hami-device-plugin-monitor
  namespace: kube-system
  resourceVersion: "737654"
  uid: 167789e8-f5a4-483a-ab5c-a1d154c8337c
spec:
  clusterIP: 10.68.183.209
  clusterIPs:
  - 10.68.183.209
  externalTrafficPolicy: Local
  internalTrafficPolicy: Cluster
  ipFamilies:
  - IPv4
  ipFamilyPolicy: SingleStack
  ports:
  - name: monitorport
    nodePort: 31992
    port: 31992
    protocol: TCP
    targetPort: 9394
  selector:
    app.kubernetes.io/component: hami-device-plugin
  sessionAffinity: None
  type: NodePort
status:
  loadBalancer: {}
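From the spec above, NodePort/port 31992 is forwarded to targetPort 9394 of the device-plugin Pod selected by app.kubernetes.io/component=hami-device-plugin. To see which Pod actually backs the Service (commands only, output omitted):

root@controller01:~# kubectl -n kube-system get endpoints hami-device-plugin-monitor
root@controller01:~# kubectl -n kube-system get pods -l app.kubernetes.io/component=hami-device-plugin -o wide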

1.1.2 Metrics

### Metrics exposed by hami-device-plugin-monitor; open http://172.20.0.21:31992/metrics in a browser
### The response looks like this:

# HELP Device_last_kernel_of_container Container device last kernel description
# TYPE Device_last_kernel_of_container gauge
Device_last_kernel_of_container{ctrname="gpu-test111",deviceuuid="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",podname="gpu-test5-01",podnamespace="default",vdeviceid="1",zone="vGPU"} 0
Device_last_kernel_of_container{ctrname="gpu-test111",deviceuuid="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",podname="gpu-test5-01",podnamespace="default",vdeviceid="0",zone="vGPU"} 0
# HELP Device_memory_desc_of_container Container device meory description
# TYPE Device_memory_desc_of_container counter
Device_memory_desc_of_container{context="0",ctrname="gpu-test111",data="0",deviceuuid="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",module="0",offset="0",podname="gpu-test5-01",podnamespace="default",vdeviceid="1",zone="vGPU"} 0
Device_memory_desc_of_container{context="272629760",ctrname="gpu-test111",data="453545472",deviceuuid="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",module="0",offset="0",podname="gpu-test5-01",podnamespace="default",vdeviceid="0",zone="vGPU"} 7.26175232e+08
# HELP Device_utilization_desc_of_container Container device utilization description
# TYPE Device_utilization_desc_of_container gauge
Device_utilization_desc_of_container{ctrname="gpu-test111",deviceuuid="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",podname="gpu-test5-01",podnamespace="default",vdeviceid="1",zone="vGPU"} 0
Device_utilization_desc_of_container{ctrname="gpu-test111",deviceuuid="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",podname="gpu-test5-01",podnamespace="default",vdeviceid="0",zone="vGPU"} 0
# HELP HostCoreUtilization GPU core utilization
# TYPE HostCoreUtilization gauge
HostCoreUtilization{deviceidx="0",deviceuuid="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",zone="vGPU"} 17
HostCoreUtilization{deviceidx="1",deviceuuid="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",zone="vGPU"} 0
# HELP HostGPUMemoryUsage GPU device memory usage
# TYPE HostGPUMemoryUsage gauge
HostGPUMemoryUsage{deviceidx="0",deviceuuid="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",zone="vGPU"} 1.49651456e+09
HostGPUMemoryUsage{deviceidx="1",deviceuuid="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",zone="vGPU"} 7.00317696e+08
# HELP vGPU_device_memory_limit_in_bytes vGPU device limit
# TYPE vGPU_device_memory_limit_in_bytes gauge
vGPU_device_memory_limit_in_bytes{ctrname="gpu-test111",deviceuuid="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",podname="gpu-test5-01",podnamespace="default",vdeviceid="1",zone="vGPU"} 3.145728e+09
vGPU_device_memory_limit_in_bytes{ctrname="gpu-test111",deviceuuid="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",podname="gpu-test5-01",podnamespace="default",vdeviceid="0",zone="vGPU"} 3.145728e+09
# HELP vGPU_device_memory_usage_in_bytes vGPU device usage
# TYPE vGPU_device_memory_usage_in_bytes gauge
vGPU_device_memory_usage_in_bytes{ctrname="gpu-test111",deviceuuid="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",podname="gpu-test5-01",podnamespace="default",vdeviceid="1",zone="vGPU"} 0
vGPU_device_memory_usage_in_bytes{ctrname="gpu-test111",deviceuuid="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",podname="gpu-test5-01",podnamespace="default",vdeviceid="0",zone="vGPU"} 7.26175232e+08
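The same endpoint can also be scraped from the command line and filtered; for example, to watch only the per-container vGPU memory series (host IP and NodePort taken from the Service above):

root@controller01:~# curl -s http://172.20.0.21:31992/metrics | grep -E '^vGPU_device_memory_(usage|limit)_in_bytes'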

1.1.3 The corresponding ServiceMonitor

This ServiceMonitor is written by hand (it is not created by the HAMi chart).

# When Prometheus is deployed the traditional way, scrape targets are configured in prometheus.yaml; once there are many targets, that file becomes hard to read.
# Service-discovery mechanisms such as file_sd_config were introduced later to reduce the amount of configuration kept in prometheus.yaml.
# In Kubernetes, a ServiceMonitor plays this "service discovery" role. In other words, without the ServiceMonitor you can still see the metrics through svc/hami-device-plugin-monitor, which HAMi deploys in ns/kube-system, but the data is not stored in the Prometheus time-series database. (The two screenshots below confirm this: smon/hami-device-plugin-svc-monitor was deleted at around 16:35 and recreated at around 16:55, and the Prometheus query returns data only outside that gap.)
root@controller01:/opt/installPkgs/k8s-vgpu-basedon-HAMi/deploy-prometheus# ll hami-device-plugin-svc-monitor.yaml
root@controller01:/opt/installPkgs/k8s-vgpu-basedon-HAMi/deploy-prometheus# cat hami-device-plugin-svc-monitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: hami-device-plugin-svc-monitor
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app.kubernetes.io/component: hami-device-plugin
  namespaceSelector:
    matchNames:
      - "kube-system"
  endpoints:
  - path: /metrics
    port: monitorport
    interval: "15s"
    honorLabels: false
    relabelings:
    - sourceLabels: [__meta_kubernetes_endpoints_name]
      regex: hami-.*
      replacement: $1
      action: keep
    - sourceLabels: [__meta_kubernetes_pod_node_name]
      regex: (.*)
      targetLabel: node_name
      replacement: ${1}
      action: replace
    - sourceLabels: [__meta_kubernetes_pod_host_ip]
      regex: (.*)
      targetLabel: ip
      replacement: $1
      action: replace


root@controller01:/opt/installPkgs/k8s-vgpu-basedon-HAMi/deploy-prometheus# kubectl -n kube-system get smon hami-device-plugin-svc-monitor -o yaml

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  annotations:
    kubectl.kubernetes.io/last-applied-configuration: |
      {"apiVersion":"monitoring.coreos.com/v1","kind":"ServiceMonitor","metadata":{"annotations":{},"name":"hami-device-plugin-svc-monitor","namespace":"kube-system"},"spec":{"endpoints":[{"honorLabels":false,"interval":"15s","path":"/metrics","port":"monitorport","relabelings":[{"action":"keep","regex":"hami-.*","replacement":"$1","sourceLabels":["__meta_kubernetes_endpoints_name"]},{"action":"replace","regex":"(.*)","replacement":"${1}","sourceLabels":["__meta_kubernetes_pod_node_name"],"targetLabel":"node_name"},{"action":"replace","regex":"(.*)","replacement":"$1","sourceLabels":["__meta_kubernetes_pod_host_ip"],"targetLabel":"ip"}]}],"namespaceSelector":{"matchNames":["kube-system"]},"selector":{"matchLabels":{"app.kubernetes.io/component":"hami-device-plugin"}}}}
  creationTimestamp: "2025-01-14T06:40:10Z"
  generation: 1
  name: hami-device-plugin-svc-monitor
  namespace: kube-system
  resourceVersion: "764614"
  uid: 8eab0c04-7047-4811-af8a-425df908b797
spec:
  endpoints:
  - honorLabels: false
    interval: 15s
    path: /metrics
    port: monitorport
    relabelings:
    - action: keep
      regex: hami-.*
      replacement: $1
      sourceLabels:
      - __meta_kubernetes_endpoints_name
    - action: replace
      regex: (.*)
      replacement: ${1}
      sourceLabels:
      - __meta_kubernetes_pod_node_name
      targetLabel: node_name
    - action: replace
      regex: (.*)
      replacement: $1
      sourceLabels:
      - __meta_kubernetes_pod_host_ip
      targetLabel: ip
  namespaceSelector:
    matchNames:
    - kube-system
  selector:
    matchLabels:
      app.kubernetes.io/component: hami-device-plugin
[Screenshot: Prometheus query showing no hami-device-plugin data while smon/hami-device-plugin-svc-monitor was deleted (~16:35)]
[Screenshot: data resumes after the ServiceMonitor was recreated (~16:55)]
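Once the ServiceMonitor exists, the series should be queryable from Prometheus itself. A sketch of such a check over the HTTP API (using the prometheus-k8s NodePort 31819 listed in section 2 and a metric name from 1.1.2):

root@controller01:~# curl -s 'http://172.20.0.21:31819/api/v1/query' \
      --data-urlencode 'query=vGPU_device_memory_usage_in_bytes{podname="gpu-test5-01"}'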

1.2 svc/hami-scheduler

1.2.1 The Service itself

root@controller01:~# kubectl -n kube-system get svc hami-scheduler -o yaml
apiVersion: v1
kind: Service
metadata:
  annotations:
    meta.helm.sh/release-name: hami
    meta.helm.sh/release-namespace: kube-system
  creationTimestamp: "2025-01-14T02:34:24Z"
  labels:
    app.kubernetes.io/component: hami-scheduler
    app.kubernetes.io/instance: hami
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/name: hami
    app.kubernetes.io/version: 2.4.1
    helm.sh/chart: hami-2.4.1
  name: hami-scheduler
  namespace: kube-system
  resourceVersion: "737655"
  uid: 6b1d223f-6d80-4ae8-a930-7878193dc200
spec:
  clusterIP: 10.68.83.62
  clusterIPs:
  - 10.68.83.62
  externalTrafficPolicy: Cluster
  internalTrafficPolicy: Cluster
  ipFamilies:
  - IPv4
  ipFamilyPolicy: SingleStack
  ports:
  - name: http
    nodePort: 31998
    port: 443
    protocol: TCP
    targetPort: 443
  - name: monitor
    nodePort: 31993
    port: 31993
    protocol: TCP
    targetPort: 9395
  selector:
    app.kubernetes.io/component: hami-scheduler
    app.kubernetes.io/instance: hami
    app.kubernetes.io/name: hami
  sessionAffinity: None
  type: NodePort
status:
  loadBalancer: {}

1.2.2 Metrics

### Metrics exposed by hami-scheduler; open http://172.20.0.21:31993/metrics in a browser
### The response looks like this:

# HELP GPUDeviceCoreAllocated Device core allocated for a certain GPU
# TYPE GPUDeviceCoreAllocated gauge
GPUDeviceCoreAllocated{deviceidx="0",deviceuuid="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",nodeid="172.20.0.21",zone="vGPU"} 0
GPUDeviceCoreAllocated{deviceidx="0",deviceuuid="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",nodeid="172.20.0.21",zone="vGPU"} 0
# HELP GPUDeviceCoreLimit Device memory core limit for a certain GPU
# TYPE GPUDeviceCoreLimit gauge
GPUDeviceCoreLimit{deviceidx="0",deviceuuid="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",nodeid="172.20.0.21",zone="vGPU"} 100
GPUDeviceCoreLimit{deviceidx="0",deviceuuid="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",nodeid="172.20.0.21",zone="vGPU"} 100
# HELP GPUDeviceMemoryAllocated Device memory allocated for a certain GPU
# TYPE GPUDeviceMemoryAllocated gauge
GPUDeviceMemoryAllocated{devicecores="0",deviceidx="0",deviceuuid="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",nodeid="172.20.0.21",zone="vGPU"} 3.145728e+09
GPUDeviceMemoryAllocated{devicecores="0",deviceidx="0",deviceuuid="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",nodeid="172.20.0.21",zone="vGPU"} 3.145728e+09
# HELP GPUDeviceMemoryLimit Device memory limit for a certain GPU
# TYPE GPUDeviceMemoryLimit gauge
GPUDeviceMemoryLimit{deviceidx="0",deviceuuid="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",nodeid="172.20.0.21",zone="vGPU"} 4.8305799168e+10
GPUDeviceMemoryLimit{deviceidx="0",deviceuuid="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",nodeid="172.20.0.21",zone="vGPU"} 4.8305799168e+10
# HELP GPUDeviceSharedNum Number of containers sharing this GPU
# TYPE GPUDeviceSharedNum gauge
GPUDeviceSharedNum{deviceidx="0",deviceuuid="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",nodeid="172.20.0.21",zone="vGPU"} 1
GPUDeviceSharedNum{deviceidx="0",deviceuuid="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",nodeid="172.20.0.21",zone="vGPU"} 1
# HELP nodeGPUMemoryPercentage GPU Memory Allocated Percentage on a certain GPU
# TYPE nodeGPUMemoryPercentage gauge
nodeGPUMemoryPercentage{deviceidx="0",deviceuuid="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",nodeid="172.20.0.21",zone="vGPU"} 0.06512112529304506
nodeGPUMemoryPercentage{deviceidx="0",deviceuuid="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",nodeid="172.20.0.21",zone="vGPU"} 0.06512112529304506
# HELP nodeGPUOverview GPU overview on a certain node
# TYPE nodeGPUOverview gauge
nodeGPUOverview{devicecores="0",deviceidx="0",devicememorylimit="46068",devicetype="NVIDIA-NVIDIA A40",deviceuuid="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",nodeid="172.20.0.21",sharedcontainers="1",zone="vGPU"} 3.145728e+09
nodeGPUOverview{devicecores="0",deviceidx="0",devicememorylimit="46068",devicetype="NVIDIA-NVIDIA A40",deviceuuid="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",nodeid="172.20.0.21",sharedcontainers="1",zone="vGPU"} 3.145728e+09
# HELP vGPUCorePercentage vGPU core allocated from a container
# TYPE vGPUCorePercentage gauge
vGPUCorePercentage{containeridx="0",deviceuuid="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",nodename="172.20.0.21",podname="gpu-test5-01",podnamespace="default",zone="vGPU"} 0
vGPUCorePercentage{containeridx="0",deviceuuid="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",nodename="172.20.0.21",podname="gpu-test5-01",podnamespace="default",zone="vGPU"} 0
# HELP vGPUMemoryPercentage vGPU memory percentage allocated from a container
# TYPE vGPUMemoryPercentage gauge
vGPUMemoryPercentage{containeridx="0",deviceuuid="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",nodename="172.20.0.21",podname="gpu-test5-01",podnamespace="default",zone="vGPU"} 0.06512112529304506
vGPUMemoryPercentage{containeridx="0",deviceuuid="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",nodename="172.20.0.21",podname="gpu-test5-01",podnamespace="default",zone="vGPU"} 0.06512112529304506
# HELP vGPUPodsDeviceAllocated vGPU Allocated from pods
# TYPE vGPUPodsDeviceAllocated gauge
vGPUPodsDeviceAllocated{containeridx="0",deviceusedcore="0",deviceuuid="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",nodename="172.20.0.21",podname="gpu-test5-01",podnamespace="default",zone="vGPU"} 3.145728e+09
vGPUPodsDeviceAllocated{containeridx="0",deviceusedcore="0",deviceuuid="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",nodename="172.20.0.21",podname="gpu-test5-01",podnamespace="default",zone="vGPU"} 3.145728e+09
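These scheduler-side series describe allocations rather than actual usage. As a sanity check, the value of nodeGPUMemoryPercentage can be reproduced from the other two series; a sketch of the corresponding PromQL query, again via the prometheus-k8s NodePort 31819 and assuming the ServiceMonitor from 1.2.3 is already in place:

root@controller01:~# curl -s 'http://172.20.0.21:31819/api/v1/query' \
      --data-urlencode 'query=sum by (deviceuuid, nodeid) (GPUDeviceMemoryAllocated) / on (deviceuuid, nodeid) GPUDeviceMemoryLimit'
# For the values above: 3.145728e9 / 4.8305799168e10 ≈ 0.0651, which matches nodeGPUMemoryPercentage.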

1.2.3 The corresponding ServiceMonitor

This ServiceMonitor is also written by hand.

root@controller01:/opt/installPkgs/k8s-vgpu-basedon-HAMi/deploy-prometheus# ll hami-scheduler-svc-monitor.yaml
root@controller01:/opt/installPkgs/k8s-vgpu-basedon-HAMi/deploy-prometheus# cat hami-scheduler-svc-monitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: hami-scheduler-svc-monitor
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app.kubernetes.io/component: hami-scheduler
  namespaceSelector:
    matchNames:
      - "kube-system"
  endpoints:
  - path: /metrics
    port: monitor
    interval: "15s"
    honorLabels: false
    relabelings:
    - sourceLabels: [__meta_kubernetes_endpoints_name]
      regex: hami-.*
      replacement: $1
      action: keep
    - sourceLabels: [__meta_kubernetes_pod_node_name]
      regex: (.*)
      targetLabel: node_name
      replacement: ${1}
      action: replace
    - sourceLabels: [__meta_kubernetes_pod_host_ip]
      regex: (.*)
      targetLabel: ip
      replacement: $1
      action: replace



root@controller01:/opt/installPkgs/k8s-vgpu-basedon-HAMi/deploy-prometheus# kubectl -n kube-system get smon hami-scheduler-svc-monitor -o yaml

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  annotations:
    kubectl.kubernetes.io/last-applied-configuration: |
      {"apiVersion":"monitoring.coreos.com/v1","kind":"ServiceMonitor","metadata":{"annotations":{},"name":"hami-scheduler-svc-monitor","namespace":"kube-system"},"spec":{"endpoints":[{"honorLabels":false,"interval":"15s","path":"/metrics","port":"monitor","relabelings":[{"action":"keep","regex":"hami-.*","replacement":"$1","sourceLabels":["__meta_kubernetes_endpoints_name"]},{"action":"replace","regex":"(.*)","replacement":"${1}","sourceLabels":["__meta_kubernetes_pod_node_name"],"targetLabel":"node_name"},{"action":"replace","regex":"(.*)","replacement":"$1","sourceLabels":["__meta_kubernetes_pod_host_ip"],"targetLabel":"ip"}]}],"namespaceSelector":{"matchNames":["kube-system"]},"selector":{"matchLabels":{"app.kubernetes.io/component":"hami-scheduler"}}}}
  creationTimestamp: "2025-01-14T06:42:15Z"
  generation: 1
  name: hami-scheduler-svc-monitor
  namespace: kube-system
  resourceVersion: "764901"
  uid: ba0dc589-4ebe-4fb8-aa11-978f5288adbd
spec:
  endpoints:
  - honorLabels: false
    interval: 15s
    path: /metrics
    port: monitor
    relabelings:
    - action: keep
      regex: hami-.*
      replacement: $1
      sourceLabels:
      - __meta_kubernetes_endpoints_name
    - action: replace
      regex: (.*)
      replacement: ${1}
      sourceLabels:
      - __meta_kubernetes_pod_node_name
      targetLabel: node_name
    - action: replace
      regex: (.*)
      replacement: $1
      sourceLabels:
      - __meta_kubernetes_pod_host_ip
      targetLabel: ip
  namespaceSelector:
    matchNames:
    - kube-system
  selector:
    matchLabels:
      app.kubernetes.io/component: hami-scheduler

1.3 Issues Found

  • Running nvidia-smi inside the Pod's container gives normal output: two "NVIDIA GPU" devices are visible, each with 3000M of device memory

  • The Pod logs look normal

root@controller01:/opt/installPkgs/k8s-vgpu-basedon-HAMi/deploy-prometheus# kubectl logs gpu-test5-01 | more
[HAMI-core Msg(1:139687502968448:libvgpu.c:836)]: Initializing.....
2025-01-15 07:46:41.274 | INFO | __main__:<module>:97 - Namespace(batch_size=32, epochs=10, lr=0.01, save_dir='./output', train_data_dir='./data/cifar10/train', val_data_dir='./data/cifar10/val', weights=None)
[HAMI-core Msg(1:139687502968448:libvgpu.c:855)]: Initialized
2025-01-15 07:46:41.588 | INFO | __main__:main:21 - Using device: cuda
2025-01-15 07:46:41.995 | INFO | __main__:main:50 - Epoch: 1/10
2025-01-15 07:46:41.996 | INFO | __main__:main:52 - Training
[HAMI-core Msg(1:139687502968448:memory.c:511)]: orig free=47283240960 total=47608692736 limit=3145728000 usage=310526720
...
2025-01-15 07:46:42.572 | INFO | __main__:main:60 - Step: 1 Loss: 2.253140
[HAMI-core Msg(1:139682238363392:memory.c:511)]: orig free=47088205824 total=47608692736 limit=3145728000 usage=451448320
...
2025-01-15 07:46:42.920 | INFO | __main__:main:60 - Step: 2 Loss: 2.400440
2025-01-15 07:46:42.944 | INFO | __main__:main:60 - Step: 3 Loss: 2.344008
2025-01-15 07:46:42.964 | INFO | __main__:main:60 - Step: 4 Loss: 2.309544
...
2025-01-15 07:52:53.087 | INFO | __main__:main:60 - Step: 1561 Loss: 0.496849
2025-01-15 07:52:53.107 | INFO | __main__:main:60 - Step: 1562 Loss: 0.342890
2025-01-15 07:52:53.128 | INFO | __main__:main:60 - Step: 1563 Loss: 0.425614
2025-01-15 07:52:53.178 | INFO | __main__:main:64 - Evaluating
2025-01-15 07:52:56.328 | INFO | __main__:main:74 - Accuracy: 7591/10000
2025-01-15 07:52:56.328 | INFO | __main__:main:79 - Saving model weights to ./output/epoch10.pth
2025-01-15 07:52:56.430 | INFO | __main__:main:84 - Done
[HAMI-core Msg(1:139687502968448:multiprocess_memory_limit.c:497)]: Calling exit handler 1
  • Device_memory_desc_of_container monitoring data looks normal
  • Device_utilization_desc_of_container monitoring data is abnormal: it stays at 0
[Screenshot: Device_utilization_desc_of_container remains 0 in Prometheus]

2. Analysis of the Services under ns/monitoring

root@controller01:~# kubectl -n monitoring get svc
NAME                    TYPE        CLUSTER-IP      EXTERNAL-IP   PORT(S)                         AGE
alertmanager-main       NodePort    10.68.143.71    <none>        9093:30854/TCP,8080:30320/TCP   5d
alertmanager-operated   ClusterIP   None            <none>        9093/TCP,9094/TCP,9094/UDP      5d
blackbox-exporter       ClusterIP   10.68.0.167     <none>        9115/TCP,19115/TCP              5d
dcgm-exporter           ClusterIP   10.68.32.112    <none>        9400/TCP                        25h
grafana                 NodePort    10.68.79.63     <none>        3000:30300/TCP                  5d
kube-state-metrics      ClusterIP   None            <none>        8443/TCP,9443/TCP               5d
node-exporter           ClusterIP   None            <none>        9100/TCP                        5d
prometheus-adapter      ClusterIP   10.68.33.74     <none>        443/TCP                         5d
prometheus-k8s          NodePort    10.68.168.213   <none>        9090:31819/TCP,8080:31903/TCP   5d
prometheus-operated     ClusterIP   None            <none>        9090/TCP                        5d
prometheus-operator     ClusterIP   None            <none>        8443/TCP                        5d
# Focus on svc/dcgm-exporter below

2.1 The svc/dcgm-exporter Service itself

root@controller01:~# kubectl -n monitoring get svc dcgm-exporter -o yaml
apiVersion: v1
kind: Service
metadata:
  annotations:
    meta.helm.sh/release-name: dcgm-exporter
    meta.helm.sh/release-namespace: monitoring
  creationTimestamp: "2025-01-14T06:37:49Z"
  labels:
    app.kubernetes.io/component: dcgm-exporter
    app.kubernetes.io/instance: dcgm-exporter
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/name: dcgm-exporter
    app.kubernetes.io/version: 3.1.7
    helm.sh/chart: dcgm-exporter-3.1.7
  name: dcgm-exporter
  namespace: monitoring
  resourceVersion: "764255"
  uid: ebe9c178-4252-43af-9ef5-01682bb7aceb
spec:
  clusterIP: 10.68.32.112
  clusterIPs:
  - 10.68.32.112
  internalTrafficPolicy: Cluster
  ipFamilies:
  - IPv4
  ipFamilyPolicy: SingleStack
  ports:
  - name: metrics
    port: 9400
    protocol: TCP
    targetPort: 9400
  selector:
    app.kubernetes.io/instance: dcgm-exporter
    app.kubernetes.io/name: dcgm-exporter
  sessionAffinity: None
  type: ClusterIP
status:
  loadBalancer: {}

2.2 Metrics from svc/dcgm-exporter

# As shown above, svc/dcgm-exporter is of type ClusterIP and listens on port 9400
root@controller01:/opt/installPkgs/k8s-vgpu-basedon-HAMi# curl 10.68.32.112:9400/metrics
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 1740
DCGM_FI_DEV_SM_CLOCK{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 210
# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz).
# TYPE DCGM_FI_DEV_MEM_CLOCK gauge
DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 7250
DCGM_FI_DEV_MEM_CLOCK{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 405
# HELP DCGM_FI_DEV_GPU_TEMP GPU temperature (in C).
# TYPE DCGM_FI_DEV_GPU_TEMP gauge
DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 66
DCGM_FI_DEV_GPU_TEMP{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 37
# HELP DCGM_FI_DEV_POWER_USAGE Power draw (in W).
# TYPE DCGM_FI_DEV_POWER_USAGE gauge
DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 124.967000
DCGM_FI_DEV_POWER_USAGE{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 30.280000
# HELP DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Total energy consumption since boot (in mJ).
# TYPE DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION counter
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 7187773583
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 7015882815
# HELP DCGM_FI_DEV_PCIE_REPLAY_COUNTER Total number of PCIe retries.
# TYPE DCGM_FI_DEV_PCIE_REPLAY_COUNTER counter
DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
# HELP DCGM_FI_DEV_GPU_UTIL GPU utilization (in %).
# TYPE DCGM_FI_DEV_GPU_UTIL gauge
DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 16
DCGM_FI_DEV_GPU_UTIL{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
# HELP DCGM_FI_DEV_MEM_COPY_UTIL Memory utilization (in %).
# TYPE DCGM_FI_DEV_MEM_COPY_UTIL gauge
DCGM_FI_DEV_MEM_COPY_UTIL{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 8
DCGM_FI_DEV_MEM_COPY_UTIL{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
# HELP DCGM_FI_DEV_ENC_UTIL Encoder utilization (in %).
# TYPE DCGM_FI_DEV_ENC_UTIL gauge
DCGM_FI_DEV_ENC_UTIL{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
DCGM_FI_DEV_ENC_UTIL{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
# HELP DCGM_FI_DEV_DEC_UTIL Decoder utilization (in %).
# TYPE DCGM_FI_DEV_DEC_UTIL gauge
DCGM_FI_DEV_DEC_UTIL{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
DCGM_FI_DEV_DEC_UTIL{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
# HELP DCGM_FI_DEV_XID_ERRORS Value of the last XID error encountered.
# TYPE DCGM_FI_DEV_XID_ERRORS gauge
DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
DCGM_FI_DEV_XID_ERRORS{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
# HELP DCGM_FI_DEV_FB_FREE Framebuffer memory free (in MiB).
# TYPE DCGM_FI_DEV_FB_FREE gauge
DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 44718
DCGM_FI_DEV_FB_FREE{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 45400
# HELP DCGM_FI_DEV_FB_USED Framebuffer memory used (in MiB).
# TYPE DCGM_FI_DEV_FB_USED gauge
DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 684
DCGM_FI_DEV_FB_USED{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 3
# HELP DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Total number of NVLink bandwidth counters for all lanes.
# TYPE DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL counter
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
# HELP DCGM_FI_DEV_VGPU_LICENSE_STATUS vGPU License status
# TYPE DCGM_FI_DEV_VGPU_LICENSE_STATUS gauge
DCGM_FI_DEV_VGPU_LICENSE_STATUS{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
DCGM_FI_DEV_VGPU_LICENSE_STATUS{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
# HELP DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS Number of remapped rows for uncorrectable errors
# TYPE DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS counter
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
# HELP DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS Number of remapped rows for correctable errors
# TYPE DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS counter
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
# HELP DCGM_FI_DEV_ROW_REMAP_FAILURE Whether remapping of rows has failed
# TYPE DCGM_FI_DEV_ROW_REMAP_FAILURE gauge
DCGM_FI_DEV_ROW_REMAP_FAILURE{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
DCGM_FI_DEV_ROW_REMAP_FAILURE{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0
# HELP DCGM_FI_PROF_GR_ENGINE_ACTIVE Ratio of time the graphics engine is active (in %).
# TYPE DCGM_FI_PROF_GR_ENGINE_ACTIVE gauge
DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0.160959
DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0.000000
# HELP DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Ratio of cycles the tensor (HMMA) pipe is active (in %).
# TYPE DCGM_FI_PROF_PIPE_TENSOR_ACTIVE gauge
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0.010100
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0.000000
# HELP DCGM_FI_PROF_DRAM_ACTIVE Ratio of cycles the device memory interface is active sending or receiving data (in %).
# TYPE DCGM_FI_PROF_DRAM_ACTIVE gauge
DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0.061415
DCGM_FI_PROF_DRAM_ACTIVE{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 0.000325
# HELP DCGM_FI_PROF_PCIE_TX_BYTES The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
# TYPE DCGM_FI_PROF_PCIE_TX_BYTES gauge
DCGM_FI_PROF_PCIE_TX_BYTES{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 6811221
DCGM_FI_PROF_PCIE_TX_BYTES{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 1837182
# HELP DCGM_FI_PROF_PCIE_RX_BYTES The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
# TYPE DCGM_FI_PROF_PCIE_RX_BYTES gauge
DCGM_FI_PROF_PCIE_RX_BYTES{gpu="0",UUID="GPU-cfca7d85-be74-7c22-4385-6fd15d698cb4",device="nvidia0",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 40172789
DCGM_FI_PROF_PCIE_RX_BYTES{gpu="1",UUID="GPU-c10058bc-1eae-1b32-ba0d-85d26c9ed9ff",device="nvidia1",modelName="NVIDIA A40",Hostname="172.20.0.21",DCGM_FI_DRIVER_VERSION="550.54.15",container="",namespace="",pod=""} 1166047
root@controller01:/opt/installPkgs/k8s-vgpu-basedon-HAMi#
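Because the Service is ClusterIP-only, the curl above has to run from inside the cluster (here, on the node itself). To reach it from a workstation, one option is a port-forward; a minimal sketch (any free local port works):

root@controller01:~# kubectl -n monitoring port-forward svc/dcgm-exporter 9400:9400
# in a second terminal:
root@controller01:~# curl -s http://127.0.0.1:9400/metrics | grep DCGM_FI_DEV_FB_USED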

2.3 The corresponding smon/dcgm-exporter

This ServiceMonitor is created automatically when the dcgm-exporter component is deployed.

root@controller01:/opt/installPkgs/k8s-vgpu-basedon-HAMi# kubectl -n monitoring get smon
NAME            AGE
...
dcgm-exporter   26h
...

# Inspect smon/dcgm-exporter itself
root@controller01:/opt/installPkgs/k8s-vgpu-basedon-HAMi# kubectl -n monitoring get smon dcgm-exporter -o yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  annotations:
    meta.helm.sh/release-name: dcgm-exporter
    meta.helm.sh/release-namespace: monitoring
  creationTimestamp: "2025-01-14T06:37:49Z"
  generation: 1
  labels:
    app.kubernetes.io/component: dcgm-exporter
    app.kubernetes.io/instance: dcgm-exporter
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/name: dcgm-exporter
    app.kubernetes.io/version: 3.1.7
    helm.sh/chart: dcgm-exporter-3.1.7
  name: dcgm-exporter
  namespace: monitoring
  resourceVersion: "764260"
  uid: cb9c11ee-6702-48b6-9780-52b15c3e78a9
spec:
  endpoints:
  - honorLabels: false
    interval: 15s
    path: /metrics
    port: metrics
    relabelings:
    - action: keep
      regex: dcgm-exporter
      replacement: $1
      sourceLabels:
      - __meta_kubernetes_endpoints_name
    - action: replace
      regex: (.*)
      replacement: ${1}
      sourceLabels:
      - __meta_kubernetes_pod_node_name
      targetLabel: node_name
    - action: replace
      regex: (.*)
      replacement: $1
      sourceLabels:
      - __meta_kubernetes_pod_host_ip
      targetLabel: ip
  namespaceSelector:
    matchNames:
    - monitoring
  selector:
    matchLabels:
      app.kubernetes.io/component: dcgm-exporter
      app.kubernetes.io/instance: dcgm-exporter
      app.kubernetes.io/name: dcgm-exporter
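Since this ServiceMonitor ships with the Helm chart, the DCGM series should appear in Prometheus without any extra steps; a quick check via the HTTP API (again using the prometheus-k8s NodePort 31819 from the Service list above):

root@controller01:~# curl -s 'http://172.20.0.21:31819/api/v1/query' \
      --data-urlencode 'query=DCGM_FI_DEV_GPU_UTIL{Hostname="172.20.0.21"}'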

2.4 Troubleshooting

2.4.1 Device_utilization_desc_of_container is abnormal and stays at 0

[Screenshot: Device_utilization_desc_of_container flat at 0 before declaring nvidia.com/gpucores]

Resolution:

Previously the workload did not request "nvidia.com/gpucores: xxx", and Device_utilization_desc_of_container stayed at 0. There is no need to set hostPID explicitly; once "nvidia.com/gpucores: xxx" is declared, Device_utilization_desc_of_container starts reporting data.

Related source code: https://github.com/Project-HAMi/HAMi-core/blob/88dd82117e057b88d5c3c19a546002838242a64f/src/multiprocess/multiprocess_utilization_watcher.c#L223

HAMi-core only starts the utilization-watcher thread when sm_limit is greater than 0 and less than or equal to 100. In https://github.com/Project-HAMi/HAMi/blob/4b1cda2ea77107826e0d6836f7b2f2dd1dbfb3b7/charts/hami/templates/scheduler/device-configmap.yaml#L21 the chart default is nvidia.defaultCores: 0, so a Pod that does not request nvidia.com/gpucores gets a core limit of 0; the compute (core) limit must be declared explicitly for this metric to be collected.
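For reference, a minimal sketch of the adjusted limits in gpu-test5.yaml (the gpucores value of 30 is only an illustrative choice; any value in (0, 100] enables the utilization watcher):

      resources:
        limits:
          nvidia.com/vgpu: 2        # requesting 2 vGPUs
          nvidia.com/gpumem: 3000   # each vGPU gets 3000M device memory
          nvidia.com/gpucores: 30   # each vGPU may use at most 30% of the GPU's SMs; with this set, Device_utilization_desc_of_container reports data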

