diff --git a/api/v1beta1/disk_full.go b/api/v1beta1/disk_full.go new file mode 100644 index 0000000000..b257627793 --- /dev/null +++ b/api/v1beta1/disk_full.go @@ -0,0 +1,130 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +package v1beta1 + +import ( + "fmt" + "strconv" + "strings" + + "github.com/hashicorp/go-multierror" + "k8s.io/apimachinery/pkg/api/resource" +) + +// DiskFullSpec represents a disk full (ENOSPC) disruption that fills a target volume +type DiskFullSpec struct { + // Path is the mount path inside the target pod to fill (e.g., "/data", "/var/log") + // +kubebuilder:validation:Required + // +kubebuilder:validation:MinLength=1 + Path string `json:"path" chaos_validate:"required"` + // Capacity is the target fill percentage of total volume capacity (e.g., "95%"). + // Mutually exclusive with Remaining. + // +kubebuilder:validation:Pattern=`^\d{1,3}%$` + Capacity string `json:"capacity,omitempty"` + // Remaining is the amount of free space to leave on the volume (e.g., "50Mi", "1Gi"). + // Mutually exclusive with Capacity. 
+ Remaining string `json:"remaining,omitempty"` +} + +// Validate validates args for the given disruption +func (s *DiskFullSpec) Validate() (retErr error) { + if strings.TrimSpace(s.Path) == "" { + retErr = multierror.Append(retErr, fmt.Errorf("the path of the disk full disruption must not be empty")) + } + + hasCapacity := s.Capacity != "" + hasRemaining := s.Remaining != "" + + if hasCapacity && hasRemaining { + retErr = multierror.Append(retErr, fmt.Errorf("capacity and remaining are mutually exclusive, only one can be set")) + } + + if !hasCapacity && !hasRemaining { + retErr = multierror.Append(retErr, fmt.Errorf("one of capacity or remaining must be set")) + } + + if hasCapacity { + if err := validateCapacity(s.Capacity); err != nil { + retErr = multierror.Append(retErr, err) + } + } + + if hasRemaining { + if err := validateRemaining(s.Remaining); err != nil { + retErr = multierror.Append(retErr, err) + } + } + + return retErr +} + +func validateCapacity(capacity string) error { + if !strings.HasSuffix(capacity, "%") { + return fmt.Errorf("capacity must be a percentage suffixed with %%, got %q", capacity) + } + + valueStr := strings.TrimSuffix(capacity, "%") + + value, err := strconv.Atoi(valueStr) + if err != nil { + return fmt.Errorf("capacity percentage must be an integer, got %q: %w", valueStr, err) + } + + if value < 1 || value > 100 { + return fmt.Errorf("capacity percentage must be between 1 and 100, got %d", value) + } + + return nil +} + +func validateRemaining(remaining string) error { + qty, err := resource.ParseQuantity(remaining) + if err != nil { + return fmt.Errorf("remaining must be a valid Kubernetes resource quantity (e.g., 50Mi, 1Gi), got %q: %w", remaining, err) + } + + if qty.Value() < 0 { + return fmt.Errorf("remaining must not be negative, got %q", remaining) + } + + return nil +} + +// GenerateArgs generates injection or cleanup pod arguments for the given spec +func (s *DiskFullSpec) GenerateArgs() []string { + args := []string{ + 
"disk-full", + "--path", + s.Path, + } + + if s.Capacity != "" { + args = append(args, "--capacity", s.Capacity) + } + + if s.Remaining != "" { + args = append(args, "--remaining", s.Remaining) + } + + return args +} + +// Explain returns a human-readable description of the disruption +func (s *DiskFullSpec) Explain() []string { + explanation := fmt.Sprintf("spec.diskFull will fill the volume mounted at %s", s.Path) + + if s.Capacity != "" { + explanation += fmt.Sprintf(" to %s of its total capacity", s.Capacity) + } + + if s.Remaining != "" { + explanation += fmt.Sprintf(", leaving only %s of free space", s.Remaining) + } + + explanation += ", causing ENOSPC errors on subsequent write operations." + + return []string{"", explanation} +} diff --git a/api/v1beta1/disk_full_test.go b/api/v1beta1/disk_full_test.go new file mode 100644 index 0000000000..0dce828ded --- /dev/null +++ b/api/v1beta1/disk_full_test.go @@ -0,0 +1,198 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +package v1beta1_test + +import ( + . "github.com/DataDog/chaos-controller/api/v1beta1" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("DiskFullSpec", func() { + When("Call the 'Validate' method", func() { + DescribeTable("success cases", + func(spec DiskFullSpec) { + Expect(spec.Validate()).Should(Succeed()) + }, + Entry("with capacity percentage", + DiskFullSpec{ + Path: "/data", + Capacity: "95%", + }, + ), + Entry("with capacity at 1%", + DiskFullSpec{ + Path: "/data", + Capacity: "1%", + }, + ), + Entry("with capacity at 100%", + DiskFullSpec{ + Path: "/data", + Capacity: "100%", + }, + ), + Entry("with remaining in Mi", + DiskFullSpec{ + Path: "/data", + Remaining: "50Mi", + }, + ), + Entry("with remaining in Gi", + DiskFullSpec{ + Path: "/var/log", + Remaining: "1Gi", + }, + ), + Entry("with remaining at 0", + DiskFullSpec{ + Path: "/data", + Remaining: "0", + }, + ), + ) + + DescribeTable("error cases", + func(spec DiskFullSpec, expectedErrors []string) { + err := spec.Validate() + Expect(err).To(HaveOccurred()) + for _, expected := range expectedErrors { + Expect(err.Error()).To(ContainSubstring(expected)) + } + }, + Entry("with empty path", + DiskFullSpec{ + Path: "", + Capacity: "95%", + }, + []string{"the path of the disk full disruption must not be empty"}, + ), + Entry("with blank path", + DiskFullSpec{ + Path: " ", + Capacity: "95%", + }, + []string{"the path of the disk full disruption must not be empty"}, + ), + Entry("with both capacity and remaining set", + DiskFullSpec{ + Path: "/data", + Capacity: "95%", + Remaining: "50Mi", + }, + []string{"capacity and remaining are mutually exclusive"}, + ), + Entry("with neither capacity nor remaining set", + DiskFullSpec{ + Path: "/data", + }, + []string{"one of capacity or remaining must be set"}, + ), + Entry("with capacity missing percent suffix", + DiskFullSpec{ + Path: "/data", + Capacity: "95", + }, + []string{"capacity must be a percentage suffixed with %"}, + ), + Entry("with capacity at 0%", + DiskFullSpec{ + Path: "/data", + Capacity: "0%", + }, + []string{"capacity percentage must 
be between 1 and 100"}, + ), + Entry("with capacity at 101%", + DiskFullSpec{ + Path: "/data", + Capacity: "101%", + }, + []string{"capacity percentage must be between 1 and 100"}, + ), + Entry("with non-numeric capacity", + DiskFullSpec{ + Path: "/data", + Capacity: "abc%", + }, + []string{"capacity percentage must be an integer"}, + ), + Entry("with invalid remaining quantity", + DiskFullSpec{ + Path: "/data", + Remaining: "not-a-quantity", + }, + []string{"remaining must be a valid Kubernetes resource quantity"}, + ), + Entry("with negative remaining", + DiskFullSpec{ + Path: "/data", + Remaining: "-1Mi", + }, + []string{"remaining must not be negative"}, + ), + Entry("with empty path and no capacity/remaining", + DiskFullSpec{ + Path: "", + }, + []string{ + "the path of the disk full disruption must not be empty", + "one of capacity or remaining must be set", + }, + ), + ) + }) + + When("Call the 'GenerateArgs' method", func() { + DescribeTable("success cases", + func(spec DiskFullSpec, expectedArgs []string) { + expectedArgs = append([]string{"disk-full"}, expectedArgs...) 
+ args := spec.GenerateArgs() + Expect(args).Should(Equal(expectedArgs)) + }, + Entry("with capacity", + DiskFullSpec{ + Path: "/data", + Capacity: "95%", + }, + []string{"--path", "/data", "--capacity", "95%"}, + ), + Entry("with remaining", + DiskFullSpec{ + Path: "/data", + Remaining: "50Mi", + }, + []string{"--path", "/data", "--remaining", "50Mi"}, + ), + ) + }) + + When("Call the 'Explain' method", func() { + It("explains capacity mode", func() { + spec := DiskFullSpec{ + Path: "/data", + Capacity: "95%", + } + explanation := spec.Explain() + Expect(explanation).To(HaveLen(2)) + Expect(explanation[1]).To(ContainSubstring("/data")) + Expect(explanation[1]).To(ContainSubstring("95%")) + Expect(explanation[1]).To(ContainSubstring("ENOSPC")) + }) + + It("explains remaining mode", func() { + spec := DiskFullSpec{ + Path: "/var/log", + Remaining: "50Mi", + } + explanation := spec.Explain() + Expect(explanation).To(HaveLen(2)) + Expect(explanation[1]).To(ContainSubstring("/var/log")) + Expect(explanation[1]).To(ContainSubstring("50Mi")) + Expect(explanation[1]).To(ContainSubstring("ENOSPC")) + }) + + }) +}) diff --git a/api/v1beta1/disruption_types.go b/api/v1beta1/disruption_types.go index ac7a6d342b..8ae20059b5 100644 --- a/api/v1beta1/disruption_types.go +++ b/api/v1beta1/disruption_types.go @@ -80,6 +80,8 @@ type DisruptionSpec struct { // +nullable DiskPressure *DiskPressureSpec `json:"diskPressure,omitempty"` // +nullable + DiskFull *DiskFullSpec `json:"diskFull,omitempty"` + // +nullable DiskFailure *DiskFailureSpec `json:"diskFailure,omitempty"` // +nullable GRPC *GRPCDisruptionSpec `json:"grpc,omitempty"` @@ -696,25 +698,25 @@ func (s DisruptionSpec) validateGlobalDisruptionScope(requireSelectors bool) (re } // Rule: At least one disruption kind must be applied - if s.CPUPressure == nil && s.MemoryPressure == nil && s.DiskPressure == nil && s.DiskFailure == nil && s.Network == nil && s.GRPC == nil && s.DNS == nil && s.ContainerFailure == nil && 
s.NodeFailure == nil && s.PodReplacement == nil { + if s.CPUPressure == nil && s.MemoryPressure == nil && s.DiskPressure == nil && s.DiskFull == nil && s.DiskFailure == nil && s.Network == nil && s.GRPC == nil && s.DNS == nil && s.ContainerFailure == nil && s.NodeFailure == nil && s.PodReplacement == nil { retErr = multierror.Append(retErr, errors.New("at least one disruption kind must be specified, please read the docs to see your options")) } // Rule: ContainerFailure, NodeFailure, and PodReplacement disruptions are not compatible with other failure types if s.ContainerFailure != nil { - if s.CPUPressure != nil || s.MemoryPressure != nil || s.DiskPressure != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.DNS != nil || s.NodeFailure != nil || s.PodReplacement != nil { + if s.CPUPressure != nil || s.MemoryPressure != nil || s.DiskPressure != nil || s.DiskFull != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.DNS != nil || s.NodeFailure != nil || s.PodReplacement != nil { retErr = multierror.Append(retErr, errors.New("container failure disruptions are not compatible with other disruption kinds. The container failure will remove the impact of the other disruption types")) } } if s.NodeFailure != nil { - if s.CPUPressure != nil || s.MemoryPressure != nil || s.DiskPressure != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.DNS != nil || s.ContainerFailure != nil || s.PodReplacement != nil { + if s.CPUPressure != nil || s.MemoryPressure != nil || s.DiskPressure != nil || s.DiskFull != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.DNS != nil || s.ContainerFailure != nil || s.PodReplacement != nil { retErr = multierror.Append(retErr, errors.New("node failure disruptions are not compatible with other disruption kinds. 
The node failure will remove the impact of the other disruption types")) } } if s.PodReplacement != nil { - if s.CPUPressure != nil || s.MemoryPressure != nil || s.DiskPressure != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.DNS != nil || s.ContainerFailure != nil || s.NodeFailure != nil { + if s.CPUPressure != nil || s.MemoryPressure != nil || s.DiskPressure != nil || s.DiskFull != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.DNS != nil || s.ContainerFailure != nil || s.NodeFailure != nil { retErr = multierror.Append(retErr, errors.New("pod replacement disruptions are not compatible with other disruption kinds. The pod replacement will remove the impact of the other disruption types")) } // Rule: container failure not possible if disruption is node-level @@ -731,6 +733,7 @@ func (s DisruptionSpec) validateGlobalDisruptionScope(requireSelectors bool) (re s.PodReplacement != nil || s.ContainerFailure != nil || s.DiskPressure != nil || + s.DiskFull != nil || s.GRPC != nil || s.DiskFailure != nil { retErr = multierror.Append(retErr, errors.New("OnInit is only compatible with network disruptions")) @@ -805,6 +808,10 @@ func (s DisruptionSpec) validateGlobalDisruptionScope(requireSelectors bool) (re retErr = multierror.Append(retErr, errors.New("DNS disruptions can only be applied at the pod level")) } + if s.DiskFull != nil && s.Level == chaostypes.DisruptionLevelNode { + retErr = multierror.Append(retErr, errors.New("disk full disruptions can only be applied at the pod level")) + } + // Rule: count must be valid if err := ValidateCount(s.Count); err != nil { retErr = multierror.Append(retErr, err) @@ -842,6 +849,8 @@ func (s DisruptionSpec) DisruptionKindPicker(kind chaostypes.DisruptionKindName) disruptionKind = s.DNS case chaostypes.DisruptionKindPodReplacement: disruptionKind = s.PodReplacement + case chaostypes.DisruptionKindDiskFull: + disruptionKind = s.DiskFull case chaostypes.DisruptionKindDiskFailure: 
disruptionKind = s.DiskFailure } @@ -930,6 +939,10 @@ func (s DisruptionSpec) DisruptionCount() int { count++ } + if s.DiskFull != nil { + count++ + } + return count } @@ -1086,6 +1099,10 @@ func (s DisruptionSpec) Explain() []string { explanation = append(explanation, s.DiskFailure.Explain()...) } + if s.DiskFull != nil { + explanation = append(explanation, s.DiskFull.Explain()...) + } + if s.GRPC != nil { explanation = append(explanation, s.GRPC.Explain()...) } diff --git a/api/v1beta1/disruption_webhook.go b/api/v1beta1/disruption_webhook.go index 187baa0dba..9ecec7d839 100644 --- a/api/v1beta1/disruption_webhook.go +++ b/api/v1beta1/disruption_webhook.go @@ -19,6 +19,7 @@ import ( "go.opentelemetry.io/otel/trace" "go.uber.org/zap" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" @@ -505,6 +506,14 @@ func (d *Disruption) initialSafetyNets() ([]string, error) { responses = append(responses, "the specified path for the disk failure disruption targeting a node must not be \"/\"") } } + + if d.Spec.DiskFull != nil { + if caught, response := safetyNetDiskFullMinFreeSpace(d); caught { + logger.Debugw("the specified disk full disruption breaches the minimum free space safety floor", tagutil.SafetyNetCatchKey, "DiskFull") + + responses = append(responses, response) + } + } } if !allowNodeFailure && d.Spec.NodeFailure != nil { @@ -733,6 +742,28 @@ func safetyNetAttemptsNodeRootDiskFailure(r *Disruption) bool { return false } +// safetyNetDiskFullMinFreeSpace checks that the disk full disruption does not breach the 1Mi minimum free space floor. 
+func safetyNetDiskFullMinFreeSpace(r *Disruption) (bool, string) { + if r.Spec.Unsafemode != nil && r.Spec.Unsafemode.AllowDiskFullNoFloor { + return false, "" + } + + if r.Spec.DiskFull.Capacity == "100%" { + return true, "disk full disruption with 100% capacity will leave 0 bytes free; " + + "set unsafeMode.allowDiskFullNoFloor=true to override the 1Mi safety floor" + } + + if r.Spec.DiskFull.Remaining != "" { + qty, err := resource.ParseQuantity(r.Spec.DiskFull.Remaining) + if err == nil && qty.Value() < 1024*1024 { + return true, fmt.Sprintf("disk full disruption remaining space %s is below the 1Mi safety floor; "+ + "set unsafeMode.allowDiskFullNoFloor=true to override", r.Spec.DiskFull.Remaining) + } + } + + return false, "" +} + // checkForDisabledDisruptions returns an error if `r` specifies any of the disruption kinds in setupWebhookConfig.DisabledDisruptions func checkForDisabledDisruptions(r *Disruption) error { for _, disKind := range chaostypes.DisruptionKindNames { diff --git a/api/v1beta1/safemode.go b/api/v1beta1/safemode.go index a174d6051e..805b3dbf1c 100644 --- a/api/v1beta1/safemode.go +++ b/api/v1beta1/safemode.go @@ -17,6 +17,7 @@ type UnsafemodeSpec struct { DisableNeitherHostNorPort bool `json:"disableNeitherHostNorPort,omitempty"` DisableSpecificContainDisk bool `json:"disableSpecificContainDisk,omitempty"` AllowRootDiskFailure bool `json:"allowRootDiskFailure,omitempty"` + AllowDiskFullNoFloor bool `json:"allowDiskFullNoFloor,omitempty"` Config *Config `json:"config,omitempty"` } diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index c9271bd209..37a0a8e9f2 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -183,6 +183,21 @@ func (in *DiskFailureSpec) DeepCopy() *DiskFailureSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *DiskFullSpec) DeepCopyInto(out *DiskFullSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DiskFullSpec. +func (in *DiskFullSpec) DeepCopy() *DiskFullSpec { + if in == nil { + return nil + } + out := new(DiskFullSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DiskPressureSpec) DeepCopyInto(out *DiskPressureSpec) { *out = *in @@ -646,6 +661,11 @@ func (in *DisruptionSpec) DeepCopyInto(out *DisruptionSpec) { *out = new(DiskPressureSpec) (*in).DeepCopyInto(*out) } + if in.DiskFull != nil { + in, out := &in.DiskFull, &out.DiskFull + *out = new(DiskFullSpec) + **out = **in + } if in.DiskFailure != nil { in, out := &in.DiskFailure, &out.DiskFailure *out = new(DiskFailureSpec) diff --git a/builderstest/chaospod.go b/builderstest/chaospod.go index 734e7451d5..57641654f7 100644 --- a/builderstest/chaospod.go +++ b/builderstest/chaospod.go @@ -6,6 +6,7 @@ package builderstest_test import ( + "path/filepath" "time" "github.com/DataDog/chaos-controller/env" @@ -175,6 +176,30 @@ func (b *ChaosPodBuilder) WithPullSecrets(imagePullSecrets []v1.LocalObjectRefer return b } +// WithDiskFullMount adds the writable shadow mount for disk-full disruptions. 
+func (b *ChaosPodBuilder) WithDiskFullMount(diskFullPath string) *ChaosPodBuilder { + b.modifiers = append( + b.modifiers, + func() { + hostPathDirectory := v1.HostPathDirectory + b.Spec.Volumes = append(b.Spec.Volumes, v1.Volume{ + Name: "disk-full-target", + VolumeSource: v1.VolumeSource{ + HostPath: &v1.HostPathVolumeSource{ + Path: diskFullPath, + Type: &hostPathDirectory, + }, + }, + }) + b.Spec.Containers[0].VolumeMounts = append(b.Spec.Containers[0].VolumeMounts, v1.VolumeMount{ + Name: "disk-full-target", + MountPath: filepath.Join("/mnt/host", diskFullPath), + }) + }) + + return b +} + // WithChaosSpec sets the chaos-specific pod spec. func (b *ChaosPodBuilder) WithChaosSpec(targetNodeName string, terminationGracePeriod, activeDeadlineSeconds int64, args []string, hostPathDirectory, pathFile v1.HostPathType, serviceAccountName string, image string, logLevel string) *ChaosPodBuilder { b.modifiers = append( diff --git a/chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml b/chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml index 14af5850d5..a0efe99bbe 100644 --- a/chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml +++ b/chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml @@ -162,6 +162,28 @@ spec: required: - paths type: object + diskFull: + description: DiskFullSpec represents a disk full (ENOSPC) disruption that fills a target volume + nullable: true + properties: + capacity: + description: |- + Capacity is the target fill percentage of total volume capacity (e.g., "95%"). + Mutually exclusive with Remaining. + pattern: ^\d{1,3}%$ + type: string + path: + description: Path is the mount path inside the target pod to fill (e.g., "/data", "/var/log") + minLength: 1 + type: string + remaining: + description: |- + Remaining is the amount of free space to leave on the volume (e.g., "50Mi", "1Gi"). + Mutually exclusive with Capacity. 
+ type: string + required: + - path + type: object diskPressure: description: DiskPressureSpec represents a disk pressure disruption nullable: true @@ -689,6 +711,8 @@ spec: UnsafemodeSpec represents a spec with parameters to turn off specific safety nets designed to catch common traps or issues running a disruption All of these are turned off by default, so disabling safety nets requires manually changing these booleans to true properties: + allowDiskFullNoFloor: + type: boolean allowRootDiskFailure: type: boolean config: diff --git a/chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml b/chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml index e9f636d1f6..8aa1a3405c 100644 --- a/chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml +++ b/chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml @@ -163,6 +163,28 @@ spec: required: - paths type: object + diskFull: + description: DiskFullSpec represents a disk full (ENOSPC) disruption that fills a target volume + nullable: true + properties: + capacity: + description: |- + Capacity is the target fill percentage of total volume capacity (e.g., "95%"). + Mutually exclusive with Remaining. + pattern: ^\d{1,3}%$ + type: string + path: + description: Path is the mount path inside the target pod to fill (e.g., "/data", "/var/log") + minLength: 1 + type: string + remaining: + description: |- + Remaining is the amount of free space to leave on the volume (e.g., "50Mi", "1Gi"). + Mutually exclusive with Capacity. 
+ type: string + required: + - path + type: object diskPressure: description: DiskPressureSpec represents a disk pressure disruption nullable: true @@ -690,6 +712,8 @@ spec: UnsafemodeSpec represents a spec with parameters to turn off specific safety nets designed to catch common traps or issues running a disruption All of these are turned off by default, so disabling safety nets requires manually changing these booleans to true properties: + allowDiskFullNoFloor: + type: boolean allowRootDiskFailure: type: boolean config: diff --git a/chart/templates/generated/chaos.datadoghq.com_disruptions.yaml b/chart/templates/generated/chaos.datadoghq.com_disruptions.yaml index fa6b4be1e3..4fdb65fef3 100644 --- a/chart/templates/generated/chaos.datadoghq.com_disruptions.yaml +++ b/chart/templates/generated/chaos.datadoghq.com_disruptions.yaml @@ -153,6 +153,28 @@ spec: required: - paths type: object + diskFull: + description: DiskFullSpec represents a disk full (ENOSPC) disruption that fills a target volume + nullable: true + properties: + capacity: + description: |- + Capacity is the target fill percentage of total volume capacity (e.g., "95%"). + Mutually exclusive with Remaining. + pattern: ^\d{1,3}%$ + type: string + path: + description: Path is the mount path inside the target pod to fill (e.g., "/data", "/var/log") + minLength: 1 + type: string + remaining: + description: |- + Remaining is the amount of free space to leave on the volume (e.g., "50Mi", "1Gi"). + Mutually exclusive with Capacity. 
+ type: string + required: + - path + type: object diskPressure: description: DiskPressureSpec represents a disk pressure disruption nullable: true @@ -680,6 +702,8 @@ spec: UnsafemodeSpec represents a spec with parameters to turn off specific safety nets designed to catch common traps or issues running a disruption All of these are turned off by default, so disabling safety nets requires manually changing these booleans to true properties: + allowDiskFullNoFloor: + type: boolean allowRootDiskFailure: type: boolean config: diff --git a/cli/injector/disk_full.go b/cli/injector/disk_full.go new file mode 100644 index 0000000000..0370c615cf --- /dev/null +++ b/cli/injector/disk_full.go @@ -0,0 +1,63 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +package main + +import ( + "errors" + "os" + "strings" + + "github.com/spf13/cobra" + + "github.com/DataDog/chaos-controller/api/v1beta1" + "github.com/DataDog/chaos-controller/injector" + "github.com/DataDog/chaos-controller/o11y/tags" +) + +var diskFullCmd = &cobra.Command{ + Use: "disk-full", + Short: "Disk full (ENOSPC) subcommand", + Run: injectAndWait, + PreRun: func(cmd *cobra.Command, args []string) { + path, _ := cmd.Flags().GetString("path") + capacity, _ := cmd.Flags().GetString("capacity") + remaining, _ := cmd.Flags().GetString("remaining") + + spec := v1beta1.DiskFullSpec{ + Path: path, + Capacity: capacity, + Remaining: remaining, + } + + for _, config := range configs { + inj, err := injector.NewDiskFullInjector(spec, injector.DiskFullInjectorConfig{Config: config}) + if err != nil { + if errors.Is(errors.Unwrap(err), os.ErrNotExist) || strings.Contains(err.Error(), "No such file or directory") { + log.Errorw("error initializing the disk full injector because the given path does not exist", tags.ErrorKey, err) + } 
else if errors.Is(errors.Unwrap(err), os.ErrPermission) { + log.Errorw("error initializing the disk full injector because the given path is not accessible", tags.ErrorKey, err) + } else { + log.Fatalw("error initializing the disk full injector", tags.ErrorKey, err) + } + } + + if inj == nil { + log.Debugln("skipping this injector because path cannot be found on specified container") + continue + } + + injectors = append(injectors, inj) + } + }, +} + +func init() { + diskFullCmd.Flags().String("path", "", "Path to apply disk full disruption to") + diskFullCmd.Flags().String("capacity", "", "Target fill percentage of total volume capacity (e.g., 95%)") + diskFullCmd.Flags().String("remaining", "", "Amount of free space to leave on the volume (e.g., 50Mi)") + + _ = cobra.MarkFlagRequired(diskFullCmd.Flags(), "path") +} diff --git a/cli/injector/main.go b/cli/injector/main.go index 69d91f603b..19b67ea2d5 100644 --- a/cli/injector/main.go +++ b/cli/injector/main.go @@ -90,6 +90,7 @@ func init() { rootCmd.AddCommand(memoryPressureCmd) rootCmd.AddCommand(memoryPressureStressCmd) rootCmd.AddCommand(diskFailureCmd) + rootCmd.AddCommand(diskFullCmd) rootCmd.AddCommand(diskPressureCmd) rootCmd.AddCommand(grpcDisruptionCmd) rootCmd.AddCommand(dnsDisruptionCmd) diff --git a/docs/README.md b/docs/README.md index 157cbe7ac9..72f9343ef5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -22,6 +22,7 @@ This folder contains the detailed documentation of most parts of the project. 
- [CPU Pressure](cpu_pressure.md) - [Memory Pressure](memory_disruption.md) - [Disk Failure](disk_failure.md) + - [Disk Full](disk_full.md) - [Disk Pressure](disk_pressure.md) - [DNS Disruption](dns_disruption.md) - [GRPC Disruption](grpc_disruption.md) diff --git a/docs/disk_full.md b/docs/disk_full.md new file mode 100644 index 0000000000..42cff0cbab --- /dev/null +++ b/docs/disk_full.md @@ -0,0 +1,129 @@ +# Disk full (ENOSPC) + +The `diskFull` field offers a way to genuinely fill a target pod volume, causing real ENOSPC errors on all subsequent write operations. Unlike disk pressure (which throttles I/O) or disk failure (which intercepts `openat` syscalls), this disruption makes the filesystem actually run out of space — visible to `df`, `statfs()`, Kubernetes eviction, and monitoring systems. + +## How it works + +The injector creates a **ballast file** (`.chaos-diskfull-{disruption-name}`) at the target path using the `fallocate(2)` syscall, which is instant (O(1), metadata-only allocation on ext4/xfs). On filesystems that don't support `fallocate`, it falls back to writing zeros. + +When the disruption is cleaned up, the ballast file is removed and space is freed immediately. + +## Spec fields + +| Field | Type | Required | Description | +|-------------|--------|----------|-------------| +| `path` | string | Yes | Mount path inside the target pod to fill (e.g., `/data`, `/var/log`) | +| `capacity` | string | One of | Fill to this percentage of total volume capacity (e.g., `"95%"`) | +| `remaining` | string | One of | Leave only this much free space on the volume (e.g., `"50Mi"`, `"1Gi"`) | + +`capacity` and `remaining` are **mutually exclusive** — exactly one must be set. 
+ +## Examples + +### Fill to 95% capacity + +```yaml +apiVersion: chaos.datadoghq.com/v1beta1 +kind: Disruption +metadata: + name: disk-full-test + namespace: my-app +spec: + level: pod + selector: + app: my-service + count: 1 + duration: 10m + diskFull: + path: "/data" + capacity: "95%" +``` + +### Leave only 10Mi free + +```yaml +apiVersion: chaos.datadoghq.com/v1beta1 +kind: Disruption +metadata: + name: disk-full-remaining + namespace: my-app +spec: + level: pod + selector: + app: my-service + count: 1 + duration: 5m + diskFull: + path: "/var/log" + remaining: "10Mi" +``` + +### Fill to 100% (requires unsafeMode) + +By default, the controller enforces a 1Mi minimum free space safety floor to prevent filesystem journal corruption. To fill completely: + +```yaml +apiVersion: chaos.datadoghq.com/v1beta1 +kind: Disruption +metadata: + name: disk-full-complete + namespace: my-app +spec: + level: pod + selector: + app: my-service + count: 1 + duration: 5m + unsafeMode: + allowDiskFullNoFloor: true + diskFull: + path: "/data" + capacity: "100%" +``` + +## Safety + +### Minimum free space floor + +A 1Mi safety floor is enforced by default. This prevents: +- Filesystem journal corruption on ext4 +- Inability to perform cleanup operations +- Cascade failures from completely exhausted filesystems + +Override with `unsafeMode.allowDiskFullNoFloor: true`. + +### Ephemeral storage eviction + +If the target pod has `resources.limits.ephemeral-storage` set and the target volume is ephemeral (`emptyDir`), filling the volume may cause the kubelet to evict the pod. This is **realistic behavior** — it's exactly what would happen if the application itself filled the disk. + +The controller emits a warning Kubernetes Event when this condition is detected, but does not block the disruption. + +### Level restriction + +Disk full disruptions are **pod-level only**. Node-level disk fill is not supported because it can crash the kubelet and affect all pods on the node. 
+ +## Manual cleanup instructions + +If the chaos pod crashes before cleanup and the finalizer fails: + +1. Identify the ballast file on the target node: + +```shell +find /var/lib/kubelet/pods/ -name ".chaos-diskfull-*" -type f +``` + +2. Remove it: + +```shell +rm /path/to/.chaos-diskfull-{disruption-name} +``` + +Space is freed immediately upon file removal. + +## Comparison with other disk disruptions + +| Disruption | Mechanism | ENOSPC on writes? | Visible to `df`/monitoring? | Affects open FDs? | +|---|---|---|---|---| +| **Disk Pressure** | Cgroup blkio throttling | No (slows I/O only) | No | N/A | +| **Disk Failure** | eBPF on `openat` | Only on file open | No | No | +| **Disk Full** | Real space allocation | Yes (all syscalls) | Yes | Yes | diff --git a/docs/disruption_catalogue.md b/docs/disruption_catalogue.md index 547fed4c80..fe87fdd0fb 100644 --- a/docs/disruption_catalogue.md +++ b/docs/disruption_catalogue.md @@ -15,14 +15,15 @@ | [CPU Pressure](#4-cpu-pressure) | Consume CPU cycles in target cgroup | Pod, Node | Yes | Yes | | [Memory Pressure](#5-memory-pressure) | Gradually consume memory in target cgroup | Pod, Node | Yes | Yes | | [Disk Pressure](#6-disk-pressure) | Throttle read/write I/O throughput | Pod, Node | Yes | Yes | -| [Disk Failure](#7-disk-failure) | Fail file open syscalls via eBPF | Pod, Node | Yes* | Yes | -| [Container Failure](#8-container-failure) | Kill container processes (SIGTERM/SIGKILL) | Pod | No | No (exclusive) | -| [Node Failure](#9-node-failure) | Kernel panic or power-off a node | Node | No | No (exclusive) | -| [Pod Replacement](#10-pod-replacement) | Cordon node, delete pod and optionally PVCs | Pod | No | No (exclusive) | +| [Disk Full](#7-disk-full) | Fill volume to trigger real ENOSPC | Pod | Yes | Yes | +| [Disk Failure](#8-disk-failure) | Fail file open syscalls via eBPF | Pod, Node | Yes* | Yes | +| [Container Failure](#9-container-failure) | Kill container processes (SIGTERM/SIGKILL) | Pod | No | No (exclusive) | +| [Node 
Failure](#10-node-failure) | Kernel panic or power-off a node | Node | No | No (exclusive) | +| [Pod Replacement](#11-pod-replacement) | Cordon node, delete pod and optionally PVCs | Pod | No | No (exclusive) | \* Disk Failure injection is removed when the injector process exits. -**Combination rule:** Network, DNS, gRPC, CPU Pressure, Memory Pressure, Disk Pressure, and Disk Failure can all be applied together in a single Disruption resource. Container Failure, Node Failure, and Pod Replacement are mutually exclusive with every other disruption type. +**Combination rule:** Network, DNS, gRPC, CPU Pressure, Memory Pressure, Disk Pressure, Disk Full, and Disk Failure can all be applied together in a single Disruption resource. Container Failure, Node Failure, and Pod Replacement are mutually exclusive with every other disruption type. --- @@ -738,7 +739,53 @@ spec: --- -## 7. Disk Failure +## 7. Disk Full + +Fills a target pod volume using the `fallocate(2)` syscall, causing real ENOSPC errors on all subsequent write operations. Unlike Disk Pressure (throttles I/O) or Disk Failure (intercepts `openat`), this disruption makes the filesystem genuinely run out of space — visible to `df`, `statfs()`, Kubernetes eviction, and monitoring. + +### Configuration + +| Field | Type | Default | Description | +| ----------- | ------ | ---------- | ------------------------------------------------------------- | +| `path` | string | (required) | Mount path inside the target pod to fill | +| `capacity` | string | — | Fill to this percentage of total capacity (e.g., `"95%"`) | +| `remaining` | string | — | Leave only this much free space (e.g., `"50Mi"`, `"1Gi"`) | + +`capacity` and `remaining` are mutually exclusive — exactly one must be set. + +### Constraints and Limitations + +| Constraint | Detail | +| --- | --- | +| Level | Pod only | +| Safety floor | 1Mi minimum free space enforced by default. 
Override with `unsafeMode.allowDiskFullNoFloor: true` | +| Ephemeral storage | Filling a volume may trigger kubelet eviction if the pod has `ephemeral-storage` limits | +| Filesystem support | `fallocate(2)` is instant on ext4/xfs. Falls back to writing zeros on NFS/FUSE | + +### Example + +```yaml +apiVersion: chaos.datadoghq.com/v1beta1 +kind: Disruption +metadata: + name: disk-full-test + namespace: my-app +spec: + level: pod + selector: + app: my-service + count: 1 + duration: 10m + diskFull: + path: "/data" + capacity: "95%" +``` + +See [docs/disk_full.md](disk_full.md) for full documentation including manual cleanup instructions. + +--- + +## 8. Disk Failure Uses eBPF to intercept `openat` syscalls and return error codes, simulating file system failures. @@ -791,7 +838,7 @@ spec: --- -## 8. Container Failure +## 9. Container Failure Sends a termination signal to container processes. @@ -832,7 +879,7 @@ spec: --- -## 9. Node Failure +## 10. Node Failure Triggers a kernel panic or power-off by writing to `/proc/sysrq-trigger`. This is **irreversible** — the node becomes unavailable. @@ -892,7 +939,7 @@ spec: --- -## 10. Pod Replacement +## 11. Pod Replacement Simulates complete pod rescheduling: cordons the node, optionally deletes PVCs, deletes the pod, then uncordons. 
diff --git a/ebpf/const-arm.go b/ebpf/const-arm.go index 10c4318297..78fb005551 100644 --- a/ebpf/const-arm.go +++ b/ebpf/const-arm.go @@ -9,3 +9,4 @@ package ebpf const SysOpenat = "__arm64_sys_openat" +const SysWrite = "__arm64_sys_write" diff --git a/ebpf/const-x64.go b/ebpf/const-x64.go index 3c73bb35b9..0d21b5442b 100644 --- a/ebpf/const-x64.go +++ b/ebpf/const-x64.go @@ -9,3 +9,4 @@ package ebpf const SysOpenat = "__x64_sys_openat" +const SysWrite = "__x64_sys_write" diff --git a/examples/complete.yaml b/examples/complete.yaml index b51c3295df..ea4235756e 100644 --- a/examples/complete.yaml +++ b/examples/complete.yaml @@ -111,6 +111,10 @@ spec: throttling: readBytesPerSec: 1024 # optional, read throttling in bytes per sec writeBytesPerSec: 2048 # optional, write throttling in bytes per sec + diskFull: # disk full (ENOSPC) disruption + path: /mnt/data # mount path inside the target pod to fill (e.g., "/data", "/var/log") + capacity: "95%" # optional (mutually exclusive with remaining), target fill percentage of total volume capacity + # remaining: "50Mi" # optional (mutually exclusive with capacity), amount of free space to leave on the volume (e.g., "50Mi", "1Gi") grpc: # disrupt gRPC responses by faking results port: 50051 # port that target grpc server is listening on endpoints: diff --git a/examples/disk_full.yaml b/examples/disk_full.yaml new file mode 100644 index 0000000000..0f93219ba1 --- /dev/null +++ b/examples/disk_full.yaml @@ -0,0 +1,19 @@ +# Unless explicitly stated otherwise all files in this repository are licensed +# under the Apache License Version 2.0. +# This product includes software developed at Datadog (https://www.datadoghq.com/). +# Copyright 2026 Datadog, Inc. 
+ +apiVersion: chaos.datadoghq.com/v1beta1 +kind: Disruption +metadata: + name: disk-full + namespace: chaos-demo +spec: + level: pod + selector: + service: demo-curl + count: 1 + duration: 10m + diskFull: + path: "/mnt/data" + capacity: "95%" diff --git a/fallocate/fallocate_darwin.go b/fallocate/fallocate_darwin.go new file mode 100644 index 0000000000..eb1eb983ef --- /dev/null +++ b/fallocate/fallocate_darwin.go @@ -0,0 +1,40 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +// Fallocate implementation for Darwin using F_PREALLOCATE fcntl. +// Based on https://github.com/detailyang/go-fallocate (MIT License). + +package fallocate + +import ( + "os" + "syscall" + "unsafe" +) + +// Fallocate pre-allocates disk space for the given file on macOS. +func Fallocate(file *os.File, offset int64, length int64) error { + if length == 0 { + return nil + } + + fst := syscall.Fstore_t{ + Flags: syscall.F_ALLOCATECONTIG, + Posmode: syscall.F_PREALLOCATE, + Offset: 0, + Length: offset + length, + Bytesalloc: 0, + } + + // Try contiguous allocation first, fall back to non-contiguous + // See: https://lists.apple.com/archives/darwin-dev/2007/Dec/msg00040.html + _, _, err := syscall.Syscall(syscall.SYS_FCNTL, file.Fd(), syscall.F_PREALLOCATE, uintptr(unsafe.Pointer(&fst))) + if err != syscall.Errno(0x0) { + fst.Flags = syscall.F_ALLOCATEALL + _, _, _ = syscall.Syscall(syscall.SYS_FCNTL, file.Fd(), syscall.F_PREALLOCATE, uintptr(unsafe.Pointer(&fst))) + } + + return syscall.Ftruncate(int(file.Fd()), fst.Length) +} diff --git a/fallocate/fallocate_linux.go b/fallocate/fallocate_linux.go new file mode 100644 index 0000000000..ab27c7ef8f --- /dev/null +++ b/fallocate/fallocate_linux.go @@ -0,0 +1,37 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// 
under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +// Fallocate implementation for Linux using the fallocate(2) syscall. +// Based on https://github.com/detailyang/go-fallocate (MIT License). +// Falls back to writing zeros if the filesystem does not support fallocate. + +package fallocate + +import ( + "errors" + "os" + "syscall" +) + +// Fallocate allocates disk space for the given file without writing data. +// If the filesystem does not support fallocate (EOPNOTSUPP), it falls back +// to writing zeros. +func Fallocate(file *os.File, offset int64, length int64) error { + if length == 0 { + return nil + } + + err := syscall.Fallocate(int(file.Fd()), 0, offset, length) + if err == nil { + return nil + } + + // Fall back to writing zeros on unsupported filesystems (e.g., NFS, some FUSE) + if errors.Is(err, syscall.EOPNOTSUPP) || errors.Is(err, syscall.ENOTSUP) { + return fallocateWrite(file, offset, length) + } + + return err +} diff --git a/fallocate/fallocate_other.go b/fallocate/fallocate_other.go new file mode 100644 index 0000000000..4186b6940c --- /dev/null +++ b/fallocate/fallocate_other.go @@ -0,0 +1,19 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +// Fallocate fallback for platforms that don't support fallocate or F_PREALLOCATE. +// Based on https://github.com/detailyang/go-fallocate (MIT License). + +//go:build !linux && !darwin + +package fallocate + +import "os" + +// Fallocate allocates disk space by writing zeros. This is the fallback +// implementation for platforms without native fallocate support. 
// writeChunkSize is the size, in bytes, of the zero-filled buffer written per
// Write call by fallocateWrite.
const writeChunkSize = 65536

// fallocateWrite allocates disk space by writing zeros in chunks.
// Used as a fallback when the platform or filesystem doesn't support fallocate.
//
// It seeks to offset (from the start of the file) and writes length zero bytes
// in chunks of at most writeChunkSize. A zero length returns nil without
// touching the file, matching the native implementations. A negative length is
// rejected with an *os.PathError wrapping os.ErrInvalid instead of silently
// succeeding. Seek and write errors are returned as-is; bytes written before a
// failure are not rolled back.
func fallocateWrite(file *os.File, offset int64, length int64) error {
	if length < 0 {
		return &os.PathError{Op: "fallocate", Path: file.Name(), Err: os.ErrInvalid}
	}

	if length == 0 {
		return nil
	}

	if _, err := file.Seek(offset, io.SeekStart); err != nil {
		return err
	}

	var zeros [writeChunkSize]byte

	for remaining := length; remaining > 0; {
		chunk := int64(len(zeros))
		if remaining < chunk {
			chunk = remaining
		}

		if _, err := file.Write(zeros[:chunk]); err != nil {
			return err
		}

		remaining -= chunk
	}

	return nil
}
+ +package injector + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + "syscall" + + "github.com/DataDog/chaos-controller/api/v1beta1" + "github.com/DataDog/chaos-controller/env" + "github.com/DataDog/chaos-controller/fallocate" + "github.com/DataDog/chaos-controller/types" + "k8s.io/apimachinery/pkg/api/resource" +) + +const ( + // minFreeSpaceBytes is the safety floor enforced unless unsafemode overrides it (1Mi) + minFreeSpaceBytes = 1024 * 1024 + // ballastFilePrefix is the prefix for ballast files created by the disk full injector + ballastFilePrefix = ".chaos-diskfull-" +) + +type diskFullInjector struct { + spec v1beta1.DiskFullSpec + config DiskFullInjectorConfig + hostPath string + ballastPath string +} + +// DiskFullInjectorConfig is the disk full injector config +type DiskFullInjectorConfig struct { + Config +} + +// NewDiskFullInjector creates a disk full injector with the given config +func NewDiskFullInjector(spec v1beta1.DiskFullSpec, config DiskFullInjectorConfig) (Injector, error) { + path := spec.Path + + // get root mount path + mountHost, ok := os.LookupEnv(env.InjectorMountHost) + if !ok { + return nil, fmt.Errorf("environment variable %s doesn't exist", env.InjectorMountHost) + } + + // get path from container info if we target a pod + if config.Disruption.Level == types.DisruptionLevelPod { + var err error + + path, err = config.TargetContainer.Runtime().HostPath(config.TargetContainer.ID(), spec.Path) + if err != nil { + return nil, fmt.Errorf("error resolving host path for disk full disruption: %w", err) + } + + if len(path) == 0 { + config.Log.Warnf("could not apply injector on container: %s; %s not found on this targeted container.", config.TargetContainer.Name(), spec.Path) + return nil, nil + } + } + + hostPath := filepath.Clean(mountHost + path) + + // validate path exists + if _, err := os.Stat(hostPath); err != nil { + return nil, fmt.Errorf("target path %s does not exist: %w", hostPath, err) + } + + 
ballastPath := filepath.Join(hostPath, ballastFilePrefix+config.Disruption.DisruptionName) + + return &diskFullInjector{ + spec: spec, + config: config, + hostPath: hostPath, + ballastPath: ballastPath, + }, nil +} + +func (i *diskFullInjector) TargetName() string { + return i.config.TargetName() +} + +func (i *diskFullInjector) GetDisruptionKind() types.DisruptionKindName { + return types.DisruptionKindDiskFull +} + +func (i *diskFullInjector) Inject() error { + return i.injectVolumeFill() +} + +func (i *diskFullInjector) injectVolumeFill() error { + // get filesystem stats + var stat syscall.Statfs_t + if err := syscall.Statfs(i.hostPath, &stat); err != nil { + return fmt.Errorf("error getting filesystem stats for %s: %w", i.hostPath, err) + } + + // Note: on Linux, Blocks/Bavail are in units of Frsize (fragment size), not Bsize. + // On ext4/xfs (the common case), Bsize == Frsize. We use Bsize here for Darwin + // compatibility in tests. The injector runs on Linux where this is correct for + // standard filesystems. + totalBytes := stat.Blocks * uint64(stat.Bsize) + // Bavail excludes space reserved for root (~5% on ext4), so we may slightly + // underestimate bytes to fill. This is the safe direction. 
+ availableBytes := stat.Bavail * uint64(stat.Bsize) + + bytesToFill, err := i.computeBytesToFill(totalBytes, availableBytes) + if err != nil { + return fmt.Errorf("error computing bytes to fill: %w", err) + } + + // enforce 1Mi safety floor + if availableBytes > minFreeSpaceBytes && bytesToFill > availableBytes-minFreeSpaceBytes { + bytesToFill = availableBytes - minFreeSpaceBytes + i.config.Log.Infow("clamped fill size to enforce 1Mi safety floor", + "bytesToFill", bytesToFill, + "availableBytes", availableBytes, + ) + } + + if bytesToFill <= 0 { + i.config.Log.Infow("volume already at or past target fill level, skipping injection", + "totalBytes", totalBytes, + "availableBytes", availableBytes, + ) + + return nil + } + + if i.config.Disruption.DryRun { + i.config.Log.Infow("dry-run: would create ballast file", + "ballastPath", i.ballastPath, + "bytesToFill", bytesToFill, + ) + + return nil + } + + i.config.Log.Infow("injecting disk full disruption", + "path", i.hostPath, + "ballastPath", i.ballastPath, + "bytesToFill", bytesToFill, + "totalBytes", totalBytes, + "availableBytes", availableBytes, + ) + + // Create ballast file and allocate space using fallocate syscall. + // On Linux, this uses fallocate(2) which is instant (metadata-only). + // Falls back to writing zeros if the filesystem doesn't support fallocate. 
+ file, err := os.Create(i.ballastPath) + if err != nil { + return fmt.Errorf("error creating ballast file %s: %w", i.ballastPath, err) + } + + defer func() { + if err := file.Close(); err != nil { + i.config.Log.Warnw("failed to close ballast file", "error", err) + } + }() + + if err := fallocate.Fallocate(file, 0, int64(bytesToFill)); err != nil { + // Clean up partial file on failure + if removeErr := os.Remove(i.ballastPath); removeErr != nil { + i.config.Log.Warnw("failed to clean up partial ballast file", "error", removeErr) + } + + return fmt.Errorf("error allocating disk space: %w", err) + } + + i.config.Log.Infow("disk full disruption injected successfully", + "ballastPath", i.ballastPath, + "bytesToFill", bytesToFill, + ) + + return nil +} + +func (i *diskFullInjector) computeBytesToFill(totalBytes, availableBytes uint64) (uint64, error) { + if i.spec.Capacity != "" { + percentStr := strings.TrimSuffix(i.spec.Capacity, "%") + + percent, err := strconv.Atoi(percentStr) + if err != nil { + return 0, fmt.Errorf("invalid capacity percentage %q: %w", i.spec.Capacity, err) + } + + usedBytes := totalBytes - availableBytes + targetUsed := totalBytes * uint64(percent) / 100 + + if targetUsed <= usedBytes { + return 0, nil + } + + return targetUsed - usedBytes, nil + } + + if i.spec.Remaining != "" { + qty, err := resource.ParseQuantity(i.spec.Remaining) + if err != nil { + return 0, fmt.Errorf("invalid remaining quantity %q: %w", i.spec.Remaining, err) + } + + remainingTarget := uint64(qty.Value()) + if availableBytes <= remainingTarget { + return 0, nil + } + + return availableBytes - remainingTarget, nil + } + + return 0, fmt.Errorf("either capacity or remaining must be set") +} + +func (i *diskFullInjector) UpdateConfig(config Config) { + i.config.Config = config +} + +func (i *diskFullInjector) Clean() error { + i.config.Log.Infow("cleaning disk full disruption", "ballastPath", i.ballastPath) + + if err := os.Remove(i.ballastPath); err != nil { + if 
errors.Is(err, os.ErrNotExist) { + i.config.Log.Infow("ballast file already removed", "ballastPath", i.ballastPath) + return nil + } + + return fmt.Errorf("error removing ballast file %s: %w", i.ballastPath, err) + } + + i.config.Log.Infow("disk full disruption cleaned successfully", "ballastPath", i.ballastPath) + + return nil +} diff --git a/injector/disk_full_test.go b/injector/disk_full_test.go new file mode 100644 index 0000000000..bec855da43 --- /dev/null +++ b/injector/disk_full_test.go @@ -0,0 +1,254 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +package injector_test + +import ( + "fmt" + "os" + "path/filepath" + "syscall" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + chaosapi "github.com/DataDog/chaos-controller/api" + "github.com/DataDog/chaos-controller/api/v1beta1" + "github.com/DataDog/chaos-controller/env" + . 
"github.com/DataDog/chaos-controller/injector" + "github.com/DataDog/chaos-controller/types" +) + +var _ = Describe("DiskFull", func() { + var ( + config DiskFullInjectorConfig + inj Injector + spec v1beta1.DiskFullSpec + tmpDir string + ) + + BeforeEach(func() { + var err error + tmpDir, err = os.MkdirTemp("", "chaos-diskfull-test-*") + Expect(err).ToNot(HaveOccurred()) + + // env vars — set mount host to empty so hostPath = tmpDir directly + os.Setenv(env.InjectorMountHost, "") + + // config — node level to avoid needing container runtime mock + config = DiskFullInjectorConfig{ + Config: Config{ + Log: log, + MetricsSink: ms, + Disruption: chaosapi.DisruptionArgs{ + Level: types.DisruptionLevelNode, + DisruptionName: "test-disruption", + }, + }, + } + + spec = v1beta1.DiskFullSpec{ + Path: tmpDir, + Capacity: "95%", + } + }) + + AfterEach(func() { + os.Unsetenv(env.InjectorMountHost) + os.RemoveAll(tmpDir) + }) + + Describe("NewDiskFullInjector", func() { + It("should create an injector successfully", func() { + var err error + inj, err = NewDiskFullInjector(spec, config) + Expect(err).ToNot(HaveOccurred()) + Expect(inj).ToNot(BeNil()) + }) + + It("should return an error when the path does not exist", func() { + spec.Path = "/nonexistent/path/that/does/not/exist" + inj, err := NewDiskFullInjector(spec, config) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("does not exist")) + Expect(inj).To(BeNil()) + }) + + It("should return an error when mount host env var is not set", func() { + os.Unsetenv(env.InjectorMountHost) + inj, err := NewDiskFullInjector(spec, config) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring(env.InjectorMountHost)) + Expect(inj).To(BeNil()) + }) + + It("should return the correct disruption kind", func() { + var err error + inj, err = NewDiskFullInjector(spec, config) + Expect(err).ToNot(HaveOccurred()) + Expect(string(inj.GetDisruptionKind())).To(Equal("disk-full")) + }) + }) + + 
Describe("Inject", func() { + JustBeforeEach(func() { + var err error + inj, err = NewDiskFullInjector(spec, config) + Expect(err).ToNot(HaveOccurred()) + Expect(inj).ToNot(BeNil()) + }) + + Context("with a small allocation that fits in available space", func() { + BeforeEach(func() { + // Compute a remaining value that will only allocate 1Mi. + // remaining = available - 1Mi, so bytesToFill = available - remaining = 1Mi. + var stat syscall.Statfs_t + err := syscall.Statfs(tmpDir, &stat) + Expect(err).ToNot(HaveOccurred()) + + availableBytes := stat.Bavail * uint64(stat.Bsize) + // Leave (available - 2Mi) as remaining, so we allocate ~2Mi minus safety floor = ~1Mi + targetRemaining := availableBytes - 2*1024*1024 + spec.Capacity = "" + spec.Remaining = formatBytes(targetRemaining) + }) + + It("should create a ballast file", func() { + err := inj.Inject() + Expect(err).ToNot(HaveOccurred()) + + ballastPath := filepath.Join(tmpDir, ".chaos-diskfull-test-disruption") + info, statErr := os.Stat(ballastPath) + Expect(statErr).ToNot(HaveOccurred()) + Expect(info.Size()).To(BeNumerically(">", 0)) + }) + }) + + Context("with capacity mode", func() { + BeforeEach(func() { + // Use capacity mode: fill to 95% — allocate a tiny amount relative to the disk + spec.Capacity = "95%" + spec.Remaining = "" + }) + + It("should create a ballast file when disk is below target capacity", func() { + err := inj.Inject() + Expect(err).ToNot(HaveOccurred()) + + ballastPath := filepath.Join(tmpDir, ".chaos-diskfull-test-disruption") + info, statErr := os.Stat(ballastPath) + Expect(statErr).ToNot(HaveOccurred()) + Expect(info.Size()).To(BeNumerically(">", 0)) + }) + }) + + Context("with remaining larger than available space", func() { + BeforeEach(func() { + spec.Capacity = "" + spec.Remaining = "999Ti" + }) + + It("should skip injection without error", func() { + err := inj.Inject() + Expect(err).ToNot(HaveOccurred()) + + ballastPath := filepath.Join(tmpDir, 
".chaos-diskfull-test-disruption") + _, statErr := os.Stat(ballastPath) + Expect(os.IsNotExist(statErr)).To(BeTrue()) + }) + }) + + Context("with dry-run mode", func() { + BeforeEach(func() { + config.Disruption.DryRun = true + }) + + It("should not create a ballast file", func() { + err := inj.Inject() + Expect(err).ToNot(HaveOccurred()) + + ballastPath := filepath.Join(tmpDir, ".chaos-diskfull-test-disruption") + _, statErr := os.Stat(ballastPath) + Expect(os.IsNotExist(statErr)).To(BeTrue()) + }) + }) + }) + + Describe("Inject and Clean round trip", func() { + It("should create and then remove the ballast file", func() { + // Compute a remaining value that allocates only ~1Mi + var stat syscall.Statfs_t + err := syscall.Statfs(tmpDir, &stat) + Expect(err).ToNot(HaveOccurred()) + + availableBytes := stat.Bavail * uint64(stat.Bsize) + targetRemaining := availableBytes - 2*1024*1024 + spec.Capacity = "" + spec.Remaining = formatBytes(targetRemaining) + + inj, err := NewDiskFullInjector(spec, config) + Expect(err).ToNot(HaveOccurred()) + + err = inj.Inject() + Expect(err).ToNot(HaveOccurred()) + + ballastPath := filepath.Join(tmpDir, ".chaos-diskfull-test-disruption") + _, statErr := os.Stat(ballastPath) + Expect(statErr).ToNot(HaveOccurred()) + + err = inj.Clean() + Expect(err).ToNot(HaveOccurred()) + + _, statErr = os.Stat(ballastPath) + Expect(os.IsNotExist(statErr)).To(BeTrue()) + }) + }) + + Describe("Clean", func() { + JustBeforeEach(func() { + var err error + inj, err = NewDiskFullInjector(spec, config) + Expect(err).ToNot(HaveOccurred()) + Expect(inj).ToNot(BeNil()) + }) + + Context("when ballast file exists", func() { + BeforeEach(func() { + ballastPath := filepath.Join(tmpDir, ".chaos-diskfull-test-disruption") + err := os.WriteFile(ballastPath, []byte("ballast"), 0644) + Expect(err).ToNot(HaveOccurred()) + }) + + It("should remove the ballast file", func() { + err := inj.Clean() + Expect(err).ToNot(HaveOccurred()) + + ballastPath := filepath.Join(tmpDir, 
// formatBytes formats a byte count as a string suitable for resource.ParseQuantity.
//
// The result always denotes exactly the given number of bytes: the Gi or Mi
// suffix is used only when the count is an exact multiple of that unit, and a
// plain decimal byte count is returned otherwise. Rounding down to a whole
// unit would shrink the requested "remaining" value by up to one unit and make
// the tests allocate far more than the couple of MiB they intend to.
func formatBytes(bytes uint64) string {
	const (
		mebibyte = uint64(1) << 20
		gibibyte = uint64(1) << 30
	)

	switch {
	case bytes >= gibibyte && bytes%gibibyte == 0:
		return fmt.Sprintf("%dGi", bytes/gibibyte)
	case bytes >= mebibyte && bytes%mebibyte == 0:
		return fmt.Sprintf("%dMi", bytes/mebibyte)
	default:
		return fmt.Sprintf("%d", bytes)
	}
}
+ +package safemode + +import ( + "github.com/DataDog/chaos-controller/api/v1beta1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +type DiskFull struct { + dis v1beta1.Disruption + client client.Client +} + +// Init Refer to safemode.Safemode interface for documentation +func (sm *DiskFull) Init(disruption v1beta1.Disruption, client client.Client) { + sm.dis = disruption + sm.client = client +} diff --git a/services/chaospod.go b/services/chaospod.go index 317af67ab6..471cf73b98 100644 --- a/services/chaospod.go +++ b/services/chaospod.go @@ -8,6 +8,7 @@ package services import ( "context" "fmt" + "path/filepath" "reflect" "strings" "time" @@ -322,6 +323,11 @@ func (m *chaosPodService) GenerateChaosPodOfDisruption(disruption *chaosv1beta1. args = append(args, "--deadline", time.Now().Add(chaostypes.InjectorPadDuration).Add(disruption.RemainingDuration()).Format(time.RFC3339)) + var diskFullPath string + if kind == chaostypes.DisruptionKindDiskFull && disruption.Spec.DiskFull != nil { + diskFullPath = disruption.Spec.DiskFull.Path + } + chaosPod = corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ GenerateName: fmt.Sprintf("chaos-%s-", disruption.Name), // generate the pod name automatically with a prefix @@ -329,7 +335,7 @@ func (m *chaosPodService) GenerateChaosPodOfDisruption(disruption *chaosv1beta1. 
Annotations: m.config.Injector.Annotations, // add extra annotations passed to the controller Labels: m.generateLabels(disruption, targetName, kind), // add default and extra podLabels passed to the controller }, - Spec: m.generateChaosPodSpec(targetNodeName, terminationGracePeriod, activeDeadlineSeconds, args, hostPathDirectory, hostPathFile), + Spec: m.generateChaosPodSpec(targetNodeName, terminationGracePeriod, activeDeadlineSeconds, args, hostPathDirectory, hostPathFile, diskFullPath), } // add finalizer to the pod, so it is not deleted before we can control its exit status @@ -463,7 +469,7 @@ func (m *chaosPodService) generateLabels(disruption *chaosv1beta1.Disruption, ta return podLabels } -func (m *chaosPodService) generateChaosPodSpec(targetNodeName string, terminationGracePeriod int64, activeDeadlineSeconds int64, args []string, hostPathDirectory corev1.HostPathType, hostPathFile corev1.HostPathType) corev1.PodSpec { +func (m *chaosPodService) generateChaosPodSpec(targetNodeName string, terminationGracePeriod int64, activeDeadlineSeconds int64, args []string, hostPathDirectory corev1.HostPathType, hostPathFile corev1.HostPathType, diskFullPath string) corev1.PodSpec { podSpec := corev1.PodSpec{ HostPID: true, // enable host pid RestartPolicy: corev1.RestartPolicyNever, // do not restart the pod on fail or completion @@ -655,6 +661,25 @@ func (m *chaosPodService) generateChaosPodSpec(targetNodeName string, terminatio } } + // For disk-full disruptions, add a writable shadow mount for only the target path. + // The host root at /mnt/host stays read-only; this mounts the specific target directory + // as writable at /mnt/host/, so the injector's path resolution works unchanged. 
+ if diskFullPath != "" { + podSpec.Volumes = append(podSpec.Volumes, corev1.Volume{ + Name: "disk-full-target", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: diskFullPath, + Type: &hostPathDirectory, + }, + }, + }) + podSpec.Containers[0].VolumeMounts = append(podSpec.Containers[0].VolumeMounts, corev1.VolumeMount{ + Name: "disk-full-target", + MountPath: filepath.Join("/mnt/host", diskFullPath), + }) + } + return podSpec } diff --git a/types/types.go b/types/types.go index 06d1be1038..d6097c6a18 100644 --- a/types/types.go +++ b/types/types.go @@ -75,6 +75,8 @@ const ( DisruptionKindCPUStress = "cpu-pressure-stress" // DisruptionKindDiskFailure is a disk failure disruption DisruptionKindDiskFailure = "disk-failure" + // DisruptionKindDiskFull is a disk full (ENOSPC) disruption + DisruptionKindDiskFull = "disk-full" // DisruptionKindDiskPressure is a disk pressure disruption DisruptionKindDiskPressure = "disk-pressure" // DisruptionKindGRPCDisruption is a grpc disruption @@ -161,6 +163,7 @@ var DisruptionKindNames = []DisruptionKindName{ DisruptionKindCPUPressure, DisruptionKindMemoryPressure, DisruptionKindDiskPressure, + DisruptionKindDiskFull, DisruptionKindDiskFailure, DisruptionKindGRPCDisruption, DisruptionKindDNSDisruption,