@@ -17,32 +17,37 @@ import (
1717
1818 api "flux-framework/flux-operator/api/v1alpha1"
1919
20+ ctrl "sigs.k8s.io/controller-runtime"
2021 jobset "sigs.k8s.io/jobset/api/v1alpha1"
2122)
2223
2324func (r * MiniClusterReconciler ) newJobSet (
2425 cluster * api.MiniCluster ,
2526) (* jobset.JobSet , error ) {
2627
27- suspend := true
28+ // When suspend is true we have a hard time debugging jobs, so keep false
29+ suspend := false
2830 jobs := jobset.JobSet {
2931 ObjectMeta : metav1.ObjectMeta {
30- Name : cluster . Name ,
32+ Name : "minicluster" ,
3133 Namespace : cluster .Namespace ,
3234 Labels : cluster .Spec .JobLabels ,
3335 },
3436 Spec : jobset.JobSetSpec {
3537
36- // Suspend child jobs (the worker pods) when broker finishes
38+ // This might be the control for child jobs (worker)
39+ // But I don't think we need this anymore.
3740 Suspend : & suspend ,
3841 // TODO decide on FailurePolicy here
3942 // default is to fail if all jobs in jobset fail
4043 },
4144 }
4245
4346 // Get leader broker job, the parent in the JobSet (worker or follower pods)
47+ // Both are required to be in indexed completion mode to have a service!
48+ // I'm not sure that totally makes sense, will suggest a change.
4449 // cluster, size, entrypoint, indexed
45- leaderJob , err := r .getJob (cluster , 1 , "broker" , false )
50+ leaderJob , err := r .getJob (cluster , 1 , "broker" , true )
4651 if err != nil {
4752 return & jobs , err
4853 }
@@ -51,10 +56,11 @@ func (r *MiniClusterReconciler) newJobSet(
5156 return & jobs , err
5257 }
5358 jobs .Spec .ReplicatedJobs = []jobset.ReplicatedJob {leaderJob , workerJob }
59+ ctrl .SetControllerReference (cluster , & jobs , r .Scheme )
5460 return & jobs , nil
5561}
5662
57- // getBrokerJob creates the job for the main leader broker
63+ // getJob creates a job for the main leader (broker) or for the workers (followers)
5864func (r * MiniClusterReconciler ) getJob (
5965 cluster * api.MiniCluster ,
6066 size int32 ,
@@ -64,18 +70,19 @@ func (r *MiniClusterReconciler) getJob(
6470
6571 backoffLimit := int32 (100 )
6672 podLabels := r .getPodLabels (cluster )
67- enableDNSHostnames := true
73+ enableDNSHostnames := false
6874 completionMode := batchv1 .NonIndexedCompletion
6975
7076 if indexed {
7177 completionMode = batchv1 .IndexedCompletion
7278 }
7379
74- // TODO how are these named
7580 job := jobset.ReplicatedJob {
7681 Name : cluster .Name + "-" + entrypoint ,
7782
78- // Allow pods to be reached by their hostnames! A simple boolean! Chef's kiss!
83+ // This would allow pods to be reached by their hostnames!
84+ // It doesn't work for the Flux broker config at the moment,
85+ // but could if we are allowed to specify the service name.
7986 // <jobSet.name>-<spec.replicatedJob.name>-<job-index>-<pod-index>.<jobSet.name>-<spec.replicatedJob.name>
8087 Network : & jobset.Network {
8188 EnableDNSHostnames : & enableDNSHostnames ,
@@ -110,7 +117,7 @@ func (r *MiniClusterReconciler) getJob(
110117 },
111118 Spec : corev1.PodSpec {
112119 // matches the service
113- // Subdomain: restfulServiceName,
120+ Subdomain : restfulServiceName ,
114121 Volumes : getVolumes (cluster , entrypoint ),
115122 RestartPolicy : corev1 .RestartPolicyOnFailure ,
116123 ImagePullSecrets : getImagePullSecrets (cluster ),
@@ -130,7 +137,12 @@ func (r *MiniClusterReconciler) getJob(
130137
131138 // Get volume mounts, add on container specific ones
132139 mounts := getVolumeMounts (cluster )
133- containers , err := r .getContainers (cluster .Spec .Containers , cluster .Name , mounts )
140+ containers , err := r .getContainers (
141+ cluster .Spec .Containers ,
142+ cluster .Name ,
143+ mounts ,
144+ entrypoint ,
145+ )
134146 jobspec .Template .Spec .Containers = containers
135147 job .Template .Spec = jobspec
136148 return job , err
0 commit comments