Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ endif
OPERATOR_SDK_VERSION ?= v1.36.1

# Image URL to use all building/pushing image targets
IMG ?= slinky.slurm.net/slurm-operator:$(VERSION)
IMG ?= ebennerv/slurm-operator:$(VERSION)

# Set the namespaces that helm tests will run against
SLURM_NAMESPACE ?= slurm
Expand Down
8 changes: 4 additions & 4 deletions cmd/manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ import (
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"

slinkyv1alpha1 "github.com/SlinkyProject/slurm-operator/api/v1alpha1"
"github.com/SlinkyProject/slurm-operator/internal/controller/cluster"
"github.com/SlinkyProject/slurm-operator/internal/controller/nodeset"
"github.com/SlinkyProject/slurm-operator/internal/resources"
slinkyv1alpha1 "github.com/togethercomputer/slurm-operator/api/v1alpha1"
"github.com/togethercomputer/slurm-operator/internal/controller/cluster"
"github.com/togethercomputer/slurm-operator/internal/controller/nodeset"
"github.com/togethercomputer/slurm-operator/internal/resources"
//+kubebuilder:scaffold:imports
)

Expand Down
2 changes: 1 addition & 1 deletion cmd/webhook/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"sigs.k8s.io/controller-runtime/pkg/metrics/server"

slinkyv1alpha1 "github.com/SlinkyProject/slurm-operator/api/v1alpha1"
slinkyv1alpha1 "github.com/togethercomputer/slurm-operator/api/v1alpha1"
//+kubebuilder:scaffold:imports
)

Expand Down
8 changes: 4 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
module github.com/SlinkyProject/slurm-operator
module github.com/togethercomputer/slurm-operator

go 1.22.0

toolchain go1.22.5
go 1.25.0

require (
github.com/SlinkyProject/slurm-client v0.1.0-20241108145209-7bfdd4923f75
Expand Down Expand Up @@ -126,3 +124,5 @@ require (
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
sigs.k8s.io/yaml v1.4.0 // indirect
)

replace k8s.io/kubernetes => k8s.io/kubernetes v1.30.3
2 changes: 2 additions & 0 deletions helm/slurm-operator/templates/operator/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ spec:
hostname: {{ include "slurm-operator.name" . }}
priorityClassName: {{ default "" .Values.priorityClassName }}
serviceAccountName: {{ include "slurm-operator.operator.serviceAccountName" . }}
tolerations: {{- toYaml .Values.operator.tolerations | nindent 6 }}
affinity: {{- toYaml .Values.operator.affinity | nindent 6 }}
{{- include "slurm-operator.imagePullSecrets" . | nindent 6 }}
containers:
- name: slurm-operator
Expand Down
1 change: 1 addition & 0 deletions helm/slurm-operator/templates/operator/rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ rules:
- get
- list
- watch
- update
- apiGroups:
- ""
resources:
Expand Down
2 changes: 2 additions & 0 deletions helm/slurm-operator/templates/webhook/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ spec:
hostname: {{ include "slurm-operator.webhook.name" . }}
priorityClassName: {{ default "" .Values.priorityClassName }}
serviceAccountName: {{ include "slurm-operator.webhook.serviceAccountName" . }}
tolerations: {{- toYaml .Values.webhook.tolerations | nindent 6 }}
affinity: {{- toYaml .Values.webhook.affinity | nindent 6 }}
{{- include "slurm-operator.imagePullSecrets" . | nindent 6 }}
containers:
- name: webhook
Expand Down
5 changes: 5 additions & 0 deletions helm/slurm-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@ operator:
# Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity
affinity: {}
#
# -- (list)
# Configure pod tolerations.
# Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/
tolerations: []
#
# -- (object)
# Set container resource requests and limits for Kubernetes Pod scheduling.
# Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container
Expand Down
45 changes: 45 additions & 0 deletions helm/slurm/templates/_slurm.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,27 @@ Define slurmd capabilities
- SYS_NICE
{{- end }}

{{/*
Resolve the login image repository.
Uses .Values.login.image.repository when it is set; otherwise falls back to
the chart-wide repository (from "slurm.image.repository") suffixed with "/sackd".
*/}}
{{- define "slurm.login.image.repository" -}}
{{- $fallback := printf "%s/sackd" (include "slurm.image.repository" .) -}}
{{- default $fallback .Values.login.image.repository -}}
{{- end }}

{{/*
Resolve the login image tag.
Uses .Values.login.image.tag when it is set; otherwise falls back to the
chart-wide tag produced by "slurm.image.tag".
*/}}
{{- define "slurm.login.image.tag" -}}
{{- $fallback := include "slurm.image.tag" . -}}
{{- default $fallback .Values.login.image.tag -}}
{{- end }}

{{/*
Build the quoted login image reference in "repository:tag" form from the
"slurm.login.image.repository" and "slurm.login.image.tag" helpers.
*/}}
{{- define "slurm.login.imageRef" -}}
{{- $repo := include "slurm.login.image.repository" . -}}
{{- $tag := include "slurm.login.image.tag" . -}}
{{- printf "%s:%s" $repo $tag | quote -}}
{{- end }}

{{/*
Define restapi name
*/}}
Expand Down Expand Up @@ -300,6 +321,30 @@ Define cluster secret name
{{- printf "%s-token-%s" .Release.Name (include "slurm.user" .) -}}
{{- end }}

{{/*
Name of the login resources: the release name followed by "-login".
*/}}
{{- define "slurm.login.name" -}}
{{- .Release.Name }}-login
{{- end }}

{{/*
Define login labels.
Emits the "login" component label, followed by the login selector labels
("slurm.login.selectorLabels") and the labels rendered by "slurm.labels".
The output is a block of YAML map entries; callers are expected to indent it.
*/}}
{{- define "slurm.login.labels" -}}
app.kubernetes.io/component: login
{{ include "slurm.login.selectorLabels" . }}
{{ include "slurm.labels" . }}
{{- end }}

{{/*
Define login selectorLabels.
Emits a fixed app.kubernetes.io/name of "login" and the release name as
app.kubernetes.io/instance. This is the subset of labels that
"slurm.login.labels" includes for selecting login pods.
*/}}
{{- define "slurm.login.selectorLabels" -}}
app.kubernetes.io/name: login
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

{{/*
Define token name
*/}}
Expand Down
4 changes: 4 additions & 0 deletions helm/slurm/templates/accounting/accounting-statefulset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ spec:
affinity:
{{- .Values.accounting.affinity | toYaml | nindent 8 }}
{{- end }}{{- /* if .Values.accounting.affinity */}}
{{- if .Values.accounting.tolerations }}
tolerations:
{{- .Values.accounting.tolerations | toYaml | nindent 8 }}
{{- end }}{{- /* if .Values.accounting.tolerations */}}
{{- include "slurm.imagePullSecrets" $ | nindent 6 }}
initContainers:
- name: init
Expand Down
21 changes: 21 additions & 0 deletions helm/slurm/templates/compute/compute-nodeset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ spec:
ports:
- name: slurmd
containerPort: {{ include "slurm.compute.port" $ }}
- name: ssh
containerPort: 22
startupProbe:
exec:
command:
Expand Down Expand Up @@ -135,6 +137,14 @@ spec:
- name: {{ $vct.metadata.name }}
mountPath: {{ $vct.spec.mountPath | default (printf "/mnt/%s" $vct.metadata.name ) }}
{{- end }}{{- /* range $vct := $nodeset.volumeClaimTemplates */}}
- mountPath: /dev/shm
name: dshm
{{- if and $nodeset.persistence $nodeset.persistence.existingDataClaims }}
{{- range $nodeset.persistence.existingDataClaims }}
- name: {{ .name }}
mountPath: {{ .mountPath }}
{{- end }}{{- /* range $nodeset.persistence.existingDataClaims */}}
{{- end }}{{- /* if $nodeset.persistence.existingDataClaims */}}
volumes:
{{- include "slurm.volumes" $ | nindent 8 }}
- name: slurm-config
Expand All @@ -143,6 +153,17 @@ spec:
sources:
- secret:
name: {{ include "slurm.auth.secretName" $ }}
- emptyDir:
medium: Memory
sizeLimit: {{ $nodeset.shmSize }}
name: dshm
{{- if and $nodeset.persistence $nodeset.persistence.existingDataClaims }}
{{- range $nodeset.persistence.existingDataClaims }}
- name: {{ .name }}
persistentVolumeClaim:
claimName: {{ .name }}
{{- end }}{{- /* range $nodeset.persistence.existingDataClaims */}}
{{- end }}{{- /* if $nodeset.persistence.existingDataClaims */}}
{{- if $nodeset.volumeClaimTemplates }}
volumeClaimTemplates:
{{- $nodeset.volumeClaimTemplates | toYaml | nindent 4 }}
Expand Down
4 changes: 4 additions & 0 deletions helm/slurm/templates/controller/controller-statefulset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ spec:
affinity:
{{- .Values.controller.affinity | toYaml | nindent 8 }}
{{- end }}{{- /* if .Values.controller.affinity */}}
{{- if .Values.controller.tolerations }}
tolerations:
{{- .Values.controller.tolerations | toYaml | nindent 8 }}
{{- end }}{{- /* if .Values.controller.tolerations */}}
{{- include "slurm.imagePullSecrets" . | nindent 6 }}
{{- if .Values.accounting.enabled }}
securityContext:
Expand Down
105 changes: 105 additions & 0 deletions helm/slurm/templates/login/login-deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
{{- /*
SPDX-FileCopyrightText: Copyright (C) SchedMD LLC.
SPDX-License-Identifier: Apache-2.0
*/}}

{{- /*
Login Deployment, rendered only when .Values.login.enabled is set.

An init container built from the authcred image runs scripts/init.sh, then the
"login" container is given SACKD_OPTIONS with --conf-server set to the
controller's name:port and exposes SSH on container port 22. The pod is
reported ready once the socket /run/slurm/sack.socket exists.

Scheduling knobs (nodeSelector, affinity, tolerations) and the extra
volumes/volumeMounts are all read from .Values.login.
*/}}
{{- if .Values.login.enabled }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "slurm.login.name" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "slurm.login.labels" . | nindent 4 }}
spec:
  replicas: {{ .Values.login.replicas | default 1 }}
  selector:
    matchLabels:
      {{- include "slurm.login.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      annotations:
        kubectl.kubernetes.io/default-container: login
      labels:
        {{- include "slurm.login.labels" . | nindent 8 }}
    spec:
      priorityClassName: {{ .Values.login.priorityClassName | default .Values.priorityClassName }}
      automountServiceAccountToken: false
      enableServiceLinks: false
      dnsConfig:
        {{- include "slurm.dnsConfig" . | nindent 8 }}
      {{- /* Same pull-secret wiring as the other workloads in this chart. */}}
      {{- include "slurm.imagePullSecrets" . | nindent 6 }}
      initContainers:
        - name: init
          image: {{ include "slurm.authcred.imageRef" . }}
          imagePullPolicy: {{ .Values.authcred.imagePullPolicy | default (include "slurm.imagePullPolicy" .) }}
          {{- with .Values.authcred.resources }}
          resources:
            {{- toYaml . | nindent 12 }}
          {{- end }}{{- /* with .Values.authcred.resources */}}
          env:
            - name: SLURM_USER
              value: {{ include "slurm.user" . }}
          command:
            - bash
            - -c
            - |
              {{- range .Files.Lines "scripts/init.sh" }}
              {{ . }}
              {{- end }}{{- /* range .Files.Lines "scripts/init.sh" */}}
          volumeMounts:
            {{- include "slurm.init.volumeMounts" . | nindent 12 }}
      containers:
        - name: login
          image: {{ include "slurm.login.imageRef" . }}
          imagePullPolicy: {{ .Values.login.imagePullPolicy | default (include "slurm.imagePullPolicy" .) }}
          {{- with .Values.login.securityContext }}
          securityContext:
            {{- toYaml . | nindent 12 }}
          {{- end }}{{- /* with .Values.login.securityContext */}}
          env:
            - name: SACKD_OPTIONS
              value: -D --conf-server={{- printf "%s:%s" (include "slurm.controller.name" .) (include "slurm.controller.port" .) }}
          ports:
            - name: ssh
              containerPort: 22
          {{- with .Values.login.resources }}
          resources:
            {{- toYaml . | nindent 12 }}
          {{- end }}{{- /* with .Values.login.resources */}}
          readinessProbe:
            exec:
              command:
                - test
                - -S
                - /run/slurm/sack.socket
          volumeMounts:
            {{- include "slurm.volumeMounts" . | nindent 12 }}
            - name: authsocket
              mountPath: /run/slurm
            {{- with .Values.login.extraVolumeMounts }}
            {{- toYaml . | nindent 12 }}
            {{- end }}{{- /* with .Values.login.extraVolumeMounts */}}
      {{- /* The node selector must come from the login values, not accounting. */}}
      {{- with .Values.login.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}{{- /* with .Values.login.nodeSelector */}}
      {{- with .Values.login.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}{{- /* with .Values.login.affinity */}}
      {{- with .Values.login.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}{{- /* with .Values.login.tolerations */}}
      volumes:
        {{- include "slurm.volumes" . | nindent 8 }}
        - name: slurm-config
          projected:
            defaultMode: 0600
            sources:
              - secret:
                  name: {{ include "slurm.auth.secretName" . }}
        {{- with .Values.login.extraVolumes }}
        {{- toYaml . | nindent 8 }}
        {{- end }}{{- /* with .Values.login.extraVolumes */}}
{{- end }}{{- /* if .Values.login.enabled */}}
18 changes: 18 additions & 0 deletions helm/slurm/templates/login/login-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{{- /*
ClusterIP Service in front of the login pods, rendered only when
.Values.login.enabled is set. It forwards TCP port 22 to the container port
named "ssh" on pods matching the login selector labels.

The Service is named via "slurm.login.name" (release-prefixed), matching the
login Deployment, so several releases can coexist in one namespace.
*/}}
{{- if .Values.login.enabled }}
apiVersion: v1
kind: Service
metadata:
  name: {{ include "slurm.login.name" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "slurm.login.labels" . | nindent 4 }}
spec:
  type: ClusterIP
  selector:
    {{- include "slurm.login.selectorLabels" . | nindent 4 }}
  ports:
    - name: ssh
      port: 22
      targetPort: ssh
      protocol: TCP
{{- end }}{{- /* if .Values.login.enabled */}}
4 changes: 4 additions & 0 deletions helm/slurm/templates/restapi/restapi-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ spec:
affinity:
{{- .Values.restapi.affinity | toYaml | nindent 8 }}
{{- end }}{{- /* if .Values.restapi.affinity */}}
{{- if .Values.restapi.tolerations }}
tolerations:
{{- .Values.restapi.tolerations | toYaml | nindent 8 }}
{{- end }}{{- /* if .Values.restapi.tolerations */}}
{{- include "slurm.imagePullSecrets" . | nindent 6 }}
initContainers:
- name: init
Expand Down
Loading