Skip to content

Commit b8050b9

Browse files
EFA daemonset changes for Neuron instances
1 parent b40c9fc commit b8050b9

File tree

7 files changed

+182
-20
lines changed

7 files changed

+182
-20
lines changed

scripts/validate-charts.sh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,17 @@ for d in */; do
1515
if [ -f ${STABLE}/${d}/ci/extra_args ]; then
1616
EXTRA_ARGS=$(cat ${STABLE}/${d}/ci/extra_args)
1717
fi
18+
19+
# EFA device plugin specific validation: ensure no instance type appears in both instance lists
20+
if [ "${d}" = "aws-efa-k8s-device-plugin/" ]; then
21+
echo "Running aws-efa-k8s-device-plugin validation for ${d}"
22+
if [ -f ${STABLE}/${d}/scripts/validate-instance-lists.sh ]; then
23+
cd ${STABLE}/${d}
24+
bash scripts/validate-instance-lists.sh || FAILED+=("${d} (instance-lists)")
25+
cd ${STABLE}
26+
fi
27+
fi
28+
1829
echo "Validating chart ${d} w/ helm"
1930
helm template ${STABLE}/${d} ${EXTRA_ARGS}| kubeval --strict --ignore-missing-schemas || FAILED+=("${d}")
2031
done

stable/aws-efa-k8s-device-plugin/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
apiVersion: v1
22
name: aws-efa-k8s-device-plugin
33
description: A Helm chart for EFA device plugin.
4-
version: v0.5.14
5-
appVersion: "v0.5.8"
4+
version: v0.5.15
5+
appVersion: "v0.5.9"
66
home: https://github.com/aws/eks-charts
77
icon: https://raw.githubusercontent.com/aws/eks-charts/master/docs/logo/aws.png
88
sources:

stable/aws-efa-k8s-device-plugin/README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,10 @@ Parameter | Description | Default
2323
--- | --- | ---
2424
`image.repository` | EFA image repository | `602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/aws-efa-k8s-device-plugin`
2525
`image.tag` | EFA image tag | `v0.5.9-rc1`
26-
`securityContext.allowPrivilegeEscalation` | Controls whether a process can gain more privilege than its parent process | `false`
27-
`securityContext` | EFA plugin security context | `capabilities: drop: ["ALL"] runAsNonRoot: false`
26+
`securityContext` | EFA plugin security context for standard instances | `allowPrivilegeEscalation: false capabilities.drop: ["ALL"] runAsNonRoot: false`
27+
`privilegedSecurityContext` | EFA plugin security context for privileged instances | `allowPrivilegeEscalation: true privileged: true runAsNonRoot: false runAsUser: 0`
28+
`privilegedSupportedInstanceLabels.keys` | Kubernetes key to interpret as privileged instance type | `node.kubernetes.io/instance-type`
29+
`privilegedSupportedInstanceLabels.values` | List of EFA supported instances which require privileged security context | `["inf1.24xlarge", "trn1.32xlarge", "trn1n.32xlarge", "trn2.48xlarge", "trn2u.48xlarge"]`
2830
`supportedInstanceLabels.keys` | Kubernetes key to interpret as instance type | `node.kubernetes.io/instance-type`
2931
`supportedInstanceLabels.values` | List of instances which currently support EFA devices | `see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-instance-types`
3032
`resources` | Resources for containers in pod | `requests.cpu: 10m requests.memory: 20Mi`
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/bin/bash
2+
set -e
3+
4+
# Validate that no instance appears in both supportedInstanceLabels and privilegedSupportedInstanceLabels
5+
6+
CHART_DIR="$(dirname "$0")/.."
7+
VALUES_FILE="$CHART_DIR/values.yaml"
8+
9+
if [ ! -f "$VALUES_FILE" ]; then
10+
echo "ERROR: values.yaml not found at $VALUES_FILE"
11+
exit 1
12+
fi
13+
14+
# Extract instance lists using yq
15+
STANDARD_INSTANCES=$(yq '.supportedInstanceLabels.values[]' "$VALUES_FILE" 2>/dev/null || echo "")
16+
PRIVILEGED_INSTANCES=$(yq '.privilegedSupportedInstanceLabels.values[]' "$VALUES_FILE" 2>/dev/null || echo "")
17+
18+
if [ -z "$STANDARD_INSTANCES" ] && [ -z "$PRIVILEGED_INSTANCES" ]; then
19+
echo "WARNING: Could not extract instance lists. Ensure yq is installed."
20+
exit 0
21+
fi
22+
23+
# Check for duplicates
24+
DUPLICATES=""
25+
for instance in $PRIVILEGED_INSTANCES; do
26+
if echo "$STANDARD_INSTANCES" | grep -q "^$instance$"; then
27+
DUPLICATES="$DUPLICATES $instance"
28+
fi
29+
done
30+
31+
if [ -n "$DUPLICATES" ]; then
32+
echo "ERROR: The following instances appear in both supportedInstanceLabels and privilegedSupportedInstanceLabels:"
33+
for dup in $DUPLICATES; do
34+
echo " - $dup"
35+
done
36+
echo ""
37+
echo "Each instance should appear in exactly one list to prevent dual DaemonSets."
38+
exit 1
39+
fi
40+
41+
echo "✅ Validation passed: No instances appear in both lists"
42+
exit 0
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
{{- if .Values.privilegedSupportedInstanceLabels.values }}
2+
apiVersion: apps/v1
3+
kind: DaemonSet
4+
metadata:
5+
name: {{ include "aws-efa-k8s-device-plugin.fullname" . }}-privileged
6+
labels:
7+
{{- include "aws-efa-k8s-device-plugin.labels" . | nindent 4 }}
8+
spec:
9+
selector:
10+
matchLabels:
11+
name: {{ include "aws-efa-k8s-device-plugin.fullname" . }}
12+
updateStrategy:
13+
type: RollingUpdate
14+
template:
15+
metadata:
16+
{{- if .Values.additionalPodAnnotations }}
17+
annotations:
18+
{{- with .Values.additionalPodAnnotations }}
19+
{{- toYaml . | nindent 8 }}
20+
{{- end }}
21+
{{- end }}
22+
labels:
23+
name: {{ include "aws-efa-k8s-device-plugin.fullname" . }}
24+
{{- with .Values.additionalPodLabels }}
25+
{{- toYaml . | nindent 8 }}
26+
{{- end }}
27+
spec:
28+
{{- with .Values.imagePullSecrets }}
29+
imagePullSecrets:
30+
{{- toYaml . | nindent 8 }}
31+
{{- end }}
32+
automountServiceAccountToken: false
33+
tolerations:
34+
- key: CriticalAddonsOnly
35+
operator: Exists
36+
{{- with .Values.tolerations }}
37+
{{- toYaml . | nindent 8 }}
38+
{{- end }}
39+
# Mark this pod as a critical add-on; when enabled, the critical add-on
40+
# scheduler reserves resources for critical add-on pods so that they can
41+
# be rescheduled after a failure.
42+
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
43+
priorityClassName: "system-node-critical"
44+
{{- with .Values.nodeSelector }}
45+
nodeSelector:
46+
{{- toYaml . | nindent 8 }}
47+
{{- end }}
48+
affinity:
49+
nodeAffinity:
50+
requiredDuringSchedulingIgnoredDuringExecution:
51+
nodeSelectorTerms:
52+
{{- range .Values.privilegedSupportedInstanceLabels.keys }}
53+
- matchExpressions:
54+
- key: {{ . }}
55+
operator: In
56+
values:
57+
{{- toYaml $.Values.privilegedSupportedInstanceLabels.values | nindent 20 }}
58+
{{- end }}
59+
- key: eks.amazonaws.com/compute-type
60+
operator: NotIn
61+
values:
62+
- auto
63+
hostNetwork: true
64+
containers:
65+
- image: {{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}
66+
name: aws-efa-k8s-device-plugin
67+
securityContext:
68+
{{- toYaml .Values.privilegedSecurityContext | nindent 12 }}
69+
{{- with .Values.resources }}
70+
resources:
71+
{{- toYaml . | nindent 12 }}
72+
{{- end }}
73+
volumeMounts:
74+
- mountPath: /var/lib/kubelet/device-plugins
75+
name: device-plugin
76+
- mountPath: /dev/infiniband/
77+
name: infiniband-volume
78+
- mountPath: /opt/aws/neuron/
79+
name: neuron-tools
80+
volumes:
81+
- hostPath:
82+
path: /var/lib/kubelet/device-plugins
83+
type: ""
84+
name: device-plugin
85+
- hostPath:
86+
path: /dev/infiniband/
87+
type: ""
88+
name: infiniband-volume
89+
- hostPath:
90+
path: /opt/aws/neuron
91+
type: DirectoryOrCreate
92+
name: neuron-tools
93+
{{- end }}

stable/aws-efa-k8s-device-plugin/templates/daemonset.yaml

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -64,20 +64,22 @@ spec:
6464
- image: {{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}
6565
name: aws-efa-k8s-device-plugin
6666
securityContext:
67-
{{- toYaml .Values.securityContext | nindent 12}}
67+
{{- toYaml .Values.securityContext | nindent 12 }}
6868
{{- with .Values.resources }}
6969
resources:
7070
{{- toYaml . | nindent 12 }}
7171
{{- end }}
7272
volumeMounts:
73-
- name: device-plugin
74-
mountPath: /var/lib/kubelet/device-plugins
75-
- name: infiniband-volume
76-
mountPath: /dev/infiniband/
73+
- mountPath: /var/lib/kubelet/device-plugins
74+
name: device-plugin
75+
- mountPath: /dev/infiniband/
76+
name: infiniband-volume
7777
volumes:
78-
- name: device-plugin
79-
hostPath:
78+
- hostPath:
8079
path: /var/lib/kubelet/device-plugins
81-
- name: infiniband-volume
82-
hostPath:
83-
path: /dev/infiniband/
80+
type: ""
81+
name: device-plugin
82+
- hostPath:
83+
path: /dev/infiniband/
84+
type: ""
85+
name: infiniband-volume

stable/aws-efa-k8s-device-plugin/values.yaml

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,29 @@
11
image:
22
repository: 602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/aws-efa-k8s-device-plugin
33
# Overrides the image tag whose default is the chart appVersion.
4-
tag: "v0.5.8"
4+
tag: "v0.5.9-rc1"
5+
# Security context for standard EFA instances
56
securityContext:
67
allowPrivilegeEscalation: false
78
capabilities:
89
drop: ["ALL"]
910
runAsNonRoot: false
11+
# Security context for privileged instances (inf/trn)
12+
privilegedSecurityContext:
13+
allowPrivilegeEscalation: true
14+
privileged: true
15+
runAsNonRoot: false
16+
runAsUser: 0
17+
# EFA supported instances that require privileged security context. This list should only include Neuron instances (inf/trn)
18+
privilegedSupportedInstanceLabels:
19+
keys:
20+
- "node.kubernetes.io/instance-type"
21+
values:
22+
- inf1.24xlarge
23+
- trn1.32xlarge
24+
- trn1n.32xlarge
25+
- trn2.48xlarge
26+
- trn2u.48xlarge
1027
supportedInstanceLabels: # EFA supported instances: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-instance-types
1128
keys:
1229
- "node.kubernetes.io/instance-type"
@@ -144,7 +161,6 @@ supportedInstanceLabels: # EFA supported instances: https://docs.aws.amazon.com/
144161
- g6e.24xlarge
145162
- g6e.48xlarge
146163
- gr6.8xlarge
147-
- inf1.24xlarge
148164
- p3dn.24xlarge
149165
- p4d.24xlarge
150166
- p4de.24xlarge
@@ -153,10 +169,6 @@ supportedInstanceLabels: # EFA supported instances: https://docs.aws.amazon.com/
153169
- p5en.48xlarge
154170
- p6-b200.48xlarge
155171
- p6e-gb200.36xlarge
156-
- trn1.32xlarge
157-
- trn1n.32xlarge
158-
- trn2.48xlarge
159-
- trn2u.48xlarge
160172
- vt1.24xlarge
161173
- hpc6a.48xlarge
162174
- hpc6id.32xlarge

0 commit comments

Comments
 (0)