From 6109573d5f24761af448b514db45a6b61700ff6c Mon Sep 17 00:00:00 2001 From: Thibault NORMAND Date: Wed, 8 Apr 2026 17:01:08 +0200 Subject: [PATCH 1/4] feat(disruption): disk full injection. Signed-off-by: Thibault NORMAND Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Thibault NORMAND --- api/v1beta1/disk_full.go | 226 ++++++++++++ api/v1beta1/disk_full_test.go | 316 +++++++++++++++++ api/v1beta1/disruption_types.go | 8 + api/v1beta1/disruption_webhook.go | 31 ++ api/v1beta1/safemode.go | 1 + api/v1beta1/zz_generated.deepcopy.go | 40 +++ .../chaos.datadoghq.com_disruptioncrons.yaml | 46 +++ ...haos.datadoghq.com_disruptionrollouts.yaml | 46 +++ .../chaos.datadoghq.com_disruptions.yaml | 46 +++ cli/injector/disk_full.go | 74 ++++ cli/injector/main.go | 1 + docs/disk_full.md | 163 +++++++++ docs/disruption_catalogue.md | 65 +++- ebpf/const-arm.go | 1 + ebpf/const-x64.go | 1 + ebpf/disk-full-write/injection.bpf.c | 102 ++++++ ebpf/disk-full-write/injection.bpf.h | 10 + ebpf/disk-full-write/main.go | 130 +++++++ fallocate/fallocate_darwin.go | 40 +++ fallocate/fallocate_linux.go | 37 ++ fallocate/fallocate_other.go | 19 + fallocate/write.go | 38 ++ injector/disk_full.go | 326 ++++++++++++++++++ injector/disk_full_test.go | 222 ++++++++++++ safemode/safemode.go | 6 + safemode/safemode_disk_full.go | 22 ++ types/types.go | 3 + 27 files changed, 2011 insertions(+), 9 deletions(-) create mode 100644 api/v1beta1/disk_full.go create mode 100644 api/v1beta1/disk_full_test.go create mode 100644 cli/injector/disk_full.go create mode 100644 docs/disk_full.md create mode 100644 ebpf/disk-full-write/injection.bpf.c create mode 100644 ebpf/disk-full-write/injection.bpf.h create mode 100644 ebpf/disk-full-write/main.go create mode 100644 fallocate/fallocate_darwin.go create mode 100644 fallocate/fallocate_linux.go create mode 100644 fallocate/fallocate_other.go create mode 100644 fallocate/write.go create mode 100644 injector/disk_full.go create mode 100644 
injector/disk_full_test.go create mode 100644 safemode/safemode_disk_full.go diff --git a/api/v1beta1/disk_full.go b/api/v1beta1/disk_full.go new file mode 100644 index 0000000000..13477df979 --- /dev/null +++ b/api/v1beta1/disk_full.go @@ -0,0 +1,226 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +package v1beta1 + +import ( + "fmt" + "strconv" + "strings" + + "github.com/hashicorp/go-multierror" + "k8s.io/apimachinery/pkg/api/resource" +) + +// DiskFullSpec represents a disk full (ENOSPC) disruption that fills a target volume +type DiskFullSpec struct { + // Path is the mount path inside the target pod to fill (e.g., "/data", "/var/log") + // +kubebuilder:validation:Required + // +kubebuilder:validation:MinLength=1 + Path string `json:"path" chaos_validate:"required"` + // Capacity is the target fill percentage of total volume capacity (e.g., "95%"). + // Mutually exclusive with Remaining. + // +kubebuilder:validation:Pattern=`^\d{1,3}%$` + Capacity string `json:"capacity,omitempty"` + // Remaining is the amount of free space to leave on the volume (e.g., "50Mi", "1Gi"). + // Mutually exclusive with Capacity. + Remaining string `json:"remaining,omitempty"` + // WriteSyscall optionally enables eBPF-based write syscall interception to return errors + // with configurable probability. This runs in addition to the volume fill. + // +nullable + WriteSyscall *WriteSyscallSpec `json:"writeSyscall,omitempty"` +} + +// WriteSyscallSpec configures eBPF-based interception of write syscalls (write, pwrite64) +// to return a configurable error code with a given probability. +type WriteSyscallSpec struct { + // ExitCode is the errno to return on intercepted write syscalls. 
+ // +kubebuilder:validation:Enum=ENOSPC;EDQUOT;EIO;EROFS;EFBIG;EPERM;EACCES + // +kubebuilder:default=ENOSPC + ExitCode string `json:"exitCode,omitempty" chaos_validate:"omitempty,oneofci=ENOSPC EDQUOT EIO EROFS EFBIG EPERM EACCES"` + // Probability is the percentage of write syscalls to fail (e.g., "50%"). Default: "100%". + Probability string `json:"probability,omitempty"` +} + +// GetExitCodeInt returns the integer errno for ExitCode, matched case-insensitively to agree with the oneofci validation; empty or unknown values fall back to ENOSPC (28). +func (s *WriteSyscallSpec) GetExitCodeInt() int { + switch strings.ToUpper(s.ExitCode) { + case "ENOSPC": + return 28 + case "EDQUOT": + return 122 + case "EIO": + return 5 + case "EROFS": + return 30 + case "EFBIG": + return 27 + case "EPERM": + return 1 + case "EACCES": + return 13 + default: + return 28 // ENOSPC + } +} + +// Validate validates args for the given disruption +func (s *DiskFullSpec) Validate() (retErr error) { + if strings.TrimSpace(s.Path) == "" { + retErr = multierror.Append(retErr, fmt.Errorf("the path of the disk full disruption must not be empty")) + } + + hasCapacity := s.Capacity != "" + hasRemaining := s.Remaining != "" + + if hasCapacity && hasRemaining { + retErr = multierror.Append(retErr, fmt.Errorf("capacity and remaining are mutually exclusive, only one can be set")) + } + + if !hasCapacity && !hasRemaining { + retErr = multierror.Append(retErr, fmt.Errorf("one of capacity or remaining must be set")) + } + + if hasCapacity { + if err := validateCapacity(s.Capacity); err != nil { + retErr = multierror.Append(retErr, err) + } + } + + if hasRemaining { + if err := validateRemaining(s.Remaining); err != nil { + retErr = multierror.Append(retErr, err) + } + } + + if s.WriteSyscall != nil { + if err := validateWriteSyscallProbability(s.WriteSyscall.Probability); err != nil { + retErr = multierror.Append(retErr, err) + } + } + + return retErr +} + +func validateCapacity(capacity string) error { + if !strings.HasSuffix(capacity, "%") { + return fmt.Errorf("capacity must be a percentage suffixed with %%, got %q",
capacity) + } + + valueStr := strings.TrimSuffix(capacity, "%") + + value, err := strconv.Atoi(valueStr) + if err != nil { + return fmt.Errorf("capacity percentage must be an integer, got %q: %w", valueStr, err) + } + + if value < 1 || value > 100 { + return fmt.Errorf("capacity percentage must be between 1 and 100, got %d", value) + } + + return nil +} + +func validateRemaining(remaining string) error { + qty, err := resource.ParseQuantity(remaining) + if err != nil { + return fmt.Errorf("remaining must be a valid Kubernetes resource quantity (e.g., 50Mi, 1Gi), got %q: %w", remaining, err) + } + + if qty.Value() < 0 { + return fmt.Errorf("remaining must not be negative, got %q", remaining) + } + + return nil +} + +func validateWriteSyscallProbability(probability string) error { + if probability == "" { + return nil + } + + if !strings.HasSuffix(probability, "%") { + return fmt.Errorf("writeSyscall probability must be a percentage suffixed with %%, got %q", probability) + } + + valueStr := strings.TrimSuffix(probability, "%") + + value, err := strconv.Atoi(valueStr) + if err != nil { + return fmt.Errorf("writeSyscall probability must be an integer, got %q: %w", valueStr, err) + } + + if value < 1 || value > 100 { + return fmt.Errorf("writeSyscall probability must be between 1 and 100, got %d", value) + } + + return nil +} + +// GenerateArgs generates injection or cleanup pod arguments for the given spec +func (s *DiskFullSpec) GenerateArgs() []string { + args := []string{ + "disk-full", + "--path", + s.Path, + } + + if s.Capacity != "" { + args = append(args, "--capacity", s.Capacity) + } + + if s.Remaining != "" { + args = append(args, "--remaining", s.Remaining) + } + + if s.WriteSyscall != nil { + exitCode := s.WriteSyscall.ExitCode + if exitCode == "" { + exitCode = "ENOSPC" + } + + args = append(args, "--write-exit-code", exitCode) + + probability := s.WriteSyscall.Probability + if probability == "" { + probability = "100%" + } + + args = append(args, 
"--write-probability", probability) + } + + return args +} + +// Explain returns a human-readable description of the disruption +func (s *DiskFullSpec) Explain() []string { + explanation := fmt.Sprintf("spec.diskFull will fill the volume mounted at %s", s.Path) + + if s.Capacity != "" { + explanation += fmt.Sprintf(" to %s of its total capacity", s.Capacity) + } + + if s.Remaining != "" { + explanation += fmt.Sprintf(", leaving only %s of free space", s.Remaining) + } + + explanation += ", causing ENOSPC errors on subsequent write operations." + + if s.WriteSyscall != nil { + exitCode := s.WriteSyscall.ExitCode + if exitCode == "" { + exitCode = "ENOSPC" + } + + probability := s.WriteSyscall.Probability + if probability == "" { + probability = "100%" + } + + explanation += fmt.Sprintf(" Additionally, write syscalls will be intercepted via eBPF and return %s %s of the time.", exitCode, probability) + } + + return []string{"", explanation} +} diff --git a/api/v1beta1/disk_full_test.go b/api/v1beta1/disk_full_test.go new file mode 100644 index 0000000000..6aac922c33 --- /dev/null +++ b/api/v1beta1/disk_full_test.go @@ -0,0 +1,316 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +package v1beta1_test + +import ( + . "github.com/DataDog/chaos-controller/api/v1beta1" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("DiskFullSpec", func() { + When("Call the 'Validate' method", func() { + DescribeTable("success cases", + func(spec DiskFullSpec) { + Expect(spec.Validate()).Should(Succeed()) + }, + Entry("with capacity percentage", + DiskFullSpec{ + Path: "/data", + Capacity: "95%", + }, + ), + Entry("with capacity at 1%", + DiskFullSpec{ + Path: "/data", + Capacity: "1%", + }, + ), + Entry("with capacity at 100%", + DiskFullSpec{ + Path: "/data", + Capacity: "100%", + }, + ), + Entry("with remaining in Mi", + DiskFullSpec{ + Path: "/data", + Remaining: "50Mi", + }, + ), + Entry("with remaining in Gi", + DiskFullSpec{ + Path: "/var/log", + Remaining: "1Gi", + }, + ), + Entry("with remaining at 0", + DiskFullSpec{ + Path: "/data", + Remaining: "0", + }, + ), + Entry("with writeSyscall defaults", + DiskFullSpec{ + Path: "/data", + Capacity: "95%", + WriteSyscall: &WriteSyscallSpec{}, + }, + ), + Entry("with writeSyscall explicit values", + DiskFullSpec{ + Path: "/data", + Capacity: "95%", + WriteSyscall: &WriteSyscallSpec{ + ExitCode: "ENOSPC", + Probability: "50%", + }, + }, + ), + Entry("with writeSyscall EDQUOT exit code", + DiskFullSpec{ + Path: "/data", + Capacity: "80%", + WriteSyscall: &WriteSyscallSpec{ + ExitCode: "EDQUOT", + Probability: "100%", + }, + }, + ), + ) + + DescribeTable("error cases", + func(spec DiskFullSpec, expectedErrors []string) { + err := spec.Validate() + Expect(err).To(HaveOccurred()) + for _, expected := range expectedErrors { + Expect(err.Error()).To(ContainSubstring(expected)) + } + }, + Entry("with empty path", + DiskFullSpec{ + Path: "", + Capacity: "95%", + }, + []string{"the path of the disk full disruption must not be empty"}, + ), + Entry("with blank path", + DiskFullSpec{ + Path: " ", + Capacity: "95%", + }, + []string{"the path of the disk full disruption must not be empty"}, + ), + Entry("with both capacity and remaining set", + DiskFullSpec{ + Path: "/data", + Capacity: "95%", + Remaining: 
"50Mi", + }, + []string{"capacity and remaining are mutually exclusive"}, + ), + Entry("with neither capacity nor remaining set", + DiskFullSpec{ + Path: "/data", + }, + []string{"one of capacity or remaining must be set"}, + ), + Entry("with capacity missing percent suffix", + DiskFullSpec{ + Path: "/data", + Capacity: "95", + }, + []string{"capacity must be a percentage suffixed with %"}, + ), + Entry("with capacity at 0%", + DiskFullSpec{ + Path: "/data", + Capacity: "0%", + }, + []string{"capacity percentage must be between 1 and 100"}, + ), + Entry("with capacity at 101%", + DiskFullSpec{ + Path: "/data", + Capacity: "101%", + }, + []string{"capacity percentage must be between 1 and 100"}, + ), + Entry("with non-numeric capacity", + DiskFullSpec{ + Path: "/data", + Capacity: "abc%", + }, + []string{"capacity percentage must be an integer"}, + ), + Entry("with invalid remaining quantity", + DiskFullSpec{ + Path: "/data", + Remaining: "not-a-quantity", + }, + []string{"remaining must be a valid Kubernetes resource quantity"}, + ), + Entry("with negative remaining", + DiskFullSpec{ + Path: "/data", + Remaining: "-1Mi", + }, + []string{"remaining must not be negative"}, + ), + Entry("with empty path and no capacity/remaining", + DiskFullSpec{ + Path: "", + }, + []string{ + "the path of the disk full disruption must not be empty", + "one of capacity or remaining must be set", + }, + ), + Entry("with writeSyscall invalid probability (no suffix)", + DiskFullSpec{ + Path: "/data", + Capacity: "95%", + WriteSyscall: &WriteSyscallSpec{ + Probability: "50", + }, + }, + []string{"writeSyscall probability must be a percentage suffixed with %"}, + ), + Entry("with writeSyscall probability 0%", + DiskFullSpec{ + Path: "/data", + Capacity: "95%", + WriteSyscall: &WriteSyscallSpec{ + Probability: "0%", + }, + }, + []string{"writeSyscall probability must be between 1 and 100"}, + ), + Entry("with writeSyscall probability 101%", + DiskFullSpec{ + Path: "/data", + Capacity: 
"95%", + WriteSyscall: &WriteSyscallSpec{ + Probability: "101%", + }, + }, + []string{"writeSyscall probability must be between 1 and 100"}, + ), + Entry("with writeSyscall non-numeric probability", + DiskFullSpec{ + Path: "/data", + Capacity: "95%", + WriteSyscall: &WriteSyscallSpec{ + Probability: "abc%", + }, + }, + []string{"writeSyscall probability must be an integer"}, + ), + ) + }) + + When("Call the 'GenerateArgs' method", func() { + DescribeTable("success cases", + func(spec DiskFullSpec, expectedArgs []string) { + expectedArgs = append([]string{"disk-full"}, expectedArgs...) + args := spec.GenerateArgs() + Expect(args).Should(Equal(expectedArgs)) + }, + Entry("with capacity", + DiskFullSpec{ + Path: "/data", + Capacity: "95%", + }, + []string{"--path", "/data", "--capacity", "95%"}, + ), + Entry("with remaining", + DiskFullSpec{ + Path: "/data", + Remaining: "50Mi", + }, + []string{"--path", "/data", "--remaining", "50Mi"}, + ), + Entry("with writeSyscall defaults", + DiskFullSpec{ + Path: "/data", + Capacity: "95%", + WriteSyscall: &WriteSyscallSpec{}, + }, + []string{"--path", "/data", "--capacity", "95%", "--write-exit-code", "ENOSPC", "--write-probability", "100%"}, + ), + Entry("with writeSyscall explicit values", + DiskFullSpec{ + Path: "/data", + Capacity: "90%", + WriteSyscall: &WriteSyscallSpec{ + ExitCode: "EDQUOT", + Probability: "50%", + }, + }, + []string{"--path", "/data", "--capacity", "90%", "--write-exit-code", "EDQUOT", "--write-probability", "50%"}, + ), + ) + }) + + When("Call the 'Explain' method", func() { + It("explains capacity mode", func() { + spec := DiskFullSpec{ + Path: "/data", + Capacity: "95%", + } + explanation := spec.Explain() + Expect(explanation).To(HaveLen(2)) + Expect(explanation[1]).To(ContainSubstring("/data")) + Expect(explanation[1]).To(ContainSubstring("95%")) + Expect(explanation[1]).To(ContainSubstring("ENOSPC")) + }) + + It("explains remaining mode", func() { + spec := DiskFullSpec{ + Path: "/var/log", + 
Remaining: "50Mi", + } + explanation := spec.Explain() + Expect(explanation).To(HaveLen(2)) + Expect(explanation[1]).To(ContainSubstring("/var/log")) + Expect(explanation[1]).To(ContainSubstring("50Mi")) + Expect(explanation[1]).To(ContainSubstring("ENOSPC")) + }) + + It("explains writeSyscall mode", func() { + spec := DiskFullSpec{ + Path: "/data", + Capacity: "95%", + WriteSyscall: &WriteSyscallSpec{ + ExitCode: "ENOSPC", + Probability: "50%", + }, + } + explanation := spec.Explain() + Expect(explanation).To(HaveLen(2)) + Expect(explanation[1]).To(ContainSubstring("eBPF")) + Expect(explanation[1]).To(ContainSubstring("ENOSPC")) + Expect(explanation[1]).To(ContainSubstring("50%")) + }) + }) + + Describe("WriteSyscallSpec", func() { + DescribeTable("GetExitCodeInt", + func(exitCode string, expected int) { + spec := WriteSyscallSpec{ExitCode: exitCode} + Expect(spec.GetExitCodeInt()).To(Equal(expected)) + }, + Entry("ENOSPC", "ENOSPC", 28), + Entry("EDQUOT", "EDQUOT", 122), + Entry("EIO", "EIO", 5), + Entry("EROFS", "EROFS", 30), + Entry("EFBIG", "EFBIG", 27), + Entry("EPERM", "EPERM", 1), + Entry("EACCES", "EACCES", 13), + Entry("empty defaults to ENOSPC", "", 28), + ) + }) +}) diff --git a/api/v1beta1/disruption_types.go b/api/v1beta1/disruption_types.go index ac7a6d342b..5c8c68c8fd 100644 --- a/api/v1beta1/disruption_types.go +++ b/api/v1beta1/disruption_types.go @@ -80,6 +80,8 @@ type DisruptionSpec struct { // +nullable DiskPressure *DiskPressureSpec `json:"diskPressure,omitempty"` // +nullable + DiskFull *DiskFullSpec `json:"diskFull,omitempty"` + // +nullable DiskFailure *DiskFailureSpec `json:"diskFailure,omitempty"` // +nullable GRPC *GRPCDisruptionSpec `json:"grpc,omitempty"` @@ -805,6 +807,10 @@ func (s DisruptionSpec) validateGlobalDisruptionScope(requireSelectors bool) (re retErr = multierror.Append(retErr, errors.New("DNS disruptions can only be applied at the pod level")) } + if s.DiskFull != nil && s.Level == chaostypes.DisruptionLevelNode { + retErr 
= multierror.Append(retErr, errors.New("disk full disruptions can only be applied at the pod level")) + } + // Rule: count must be valid if err := ValidateCount(s.Count); err != nil { retErr = multierror.Append(retErr, err) @@ -842,6 +848,8 @@ func (s DisruptionSpec) DisruptionKindPicker(kind chaostypes.DisruptionKindName) disruptionKind = s.DNS case chaostypes.DisruptionKindPodReplacement: disruptionKind = s.PodReplacement + case chaostypes.DisruptionKindDiskFull: + disruptionKind = s.DiskFull case chaostypes.DisruptionKindDiskFailure: disruptionKind = s.DiskFailure } diff --git a/api/v1beta1/disruption_webhook.go b/api/v1beta1/disruption_webhook.go index 187baa0dba..9ecec7d839 100644 --- a/api/v1beta1/disruption_webhook.go +++ b/api/v1beta1/disruption_webhook.go @@ -19,6 +19,7 @@ import ( "go.opentelemetry.io/otel/trace" "go.uber.org/zap" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" @@ -505,6 +506,14 @@ func (d *Disruption) initialSafetyNets() ([]string, error) { responses = append(responses, "the specified path for the disk failure disruption targeting a node must not be \"/\"") } } + + if d.Spec.DiskFull != nil { + if caught, response := safetyNetDiskFullMinFreeSpace(d); caught { + logger.Debugw("the specified disk full disruption breaches the minimum free space safety floor", tagutil.SafetyNetCatchKey, "DiskFull") + + responses = append(responses, response) + } + } } if !allowNodeFailure && d.Spec.NodeFailure != nil { @@ -733,6 +742,28 @@ func safetyNetAttemptsNodeRootDiskFailure(r *Disruption) bool { return false } +// safetyNetDiskFullMinFreeSpace checks that the disk full disruption does not breach the 1Mi minimum free space floor. 
+func safetyNetDiskFullMinFreeSpace(r *Disruption) (bool, string) { + if r.Spec.Unsafemode != nil && r.Spec.Unsafemode.AllowDiskFullNoFloor { + return false, "" + } + + if r.Spec.DiskFull.Capacity == "100%" { + return true, "disk full disruption with 100% capacity will leave 0 bytes free; " + + "set unsafeMode.allowDiskFullNoFloor=true to override the 1Mi safety floor" + } + + if r.Spec.DiskFull.Remaining != "" { + qty, err := resource.ParseQuantity(r.Spec.DiskFull.Remaining) + if err == nil && qty.Value() < 1024*1024 { + return true, fmt.Sprintf("disk full disruption remaining space %s is below the 1Mi safety floor; "+ + "set unsafeMode.allowDiskFullNoFloor=true to override", r.Spec.DiskFull.Remaining) + } + } + + return false, "" +} + // checkForDisabledDisruptions returns an error if `r` specifies any of the disruption kinds in setupWebhookConfig.DisabledDisruptions func checkForDisabledDisruptions(r *Disruption) error { for _, disKind := range chaostypes.DisruptionKindNames { diff --git a/api/v1beta1/safemode.go b/api/v1beta1/safemode.go index a174d6051e..805b3dbf1c 100644 --- a/api/v1beta1/safemode.go +++ b/api/v1beta1/safemode.go @@ -17,6 +17,7 @@ type UnsafemodeSpec struct { DisableNeitherHostNorPort bool `json:"disableNeitherHostNorPort,omitempty"` DisableSpecificContainDisk bool `json:"disableSpecificContainDisk,omitempty"` AllowRootDiskFailure bool `json:"allowRootDiskFailure,omitempty"` + AllowDiskFullNoFloor bool `json:"allowDiskFullNoFloor,omitempty"` Config *Config `json:"config,omitempty"` } diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index c9271bd209..6665c482b8 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -183,6 +183,26 @@ func (in *DiskFailureSpec) DeepCopy() *DiskFailureSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *DiskFullSpec) DeepCopyInto(out *DiskFullSpec) { + *out = *in + if in.WriteSyscall != nil { + in, out := &in.WriteSyscall, &out.WriteSyscall + *out = new(WriteSyscallSpec) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DiskFullSpec. +func (in *DiskFullSpec) DeepCopy() *DiskFullSpec { + if in == nil { + return nil + } + out := new(DiskFullSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DiskPressureSpec) DeepCopyInto(out *DiskPressureSpec) { *out = *in @@ -646,6 +666,11 @@ func (in *DisruptionSpec) DeepCopyInto(out *DisruptionSpec) { *out = new(DiskPressureSpec) (*in).DeepCopyInto(*out) } + if in.DiskFull != nil { + in, out := &in.DiskFull, &out.DiskFull + *out = new(DiskFullSpec) + (*in).DeepCopyInto(*out) + } if in.DiskFailure != nil { in, out := &in.DiskFailure, &out.DiskFailure *out = new(DiskFailureSpec) @@ -1212,3 +1237,18 @@ func (in *UnsafemodeSpec) DeepCopy() *UnsafemodeSpec { in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *WriteSyscallSpec) DeepCopyInto(out *WriteSyscallSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WriteSyscallSpec. 
+func (in *WriteSyscallSpec) DeepCopy() *WriteSyscallSpec { + if in == nil { + return nil + } + out := new(WriteSyscallSpec) + in.DeepCopyInto(out) + return out +} diff --git a/chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml b/chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml index 14af5850d5..19799d71c7 100644 --- a/chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml +++ b/chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml @@ -162,6 +162,50 @@ spec: required: - paths type: object + diskFull: + description: DiskFullSpec represents a disk full (ENOSPC) disruption that fills a target volume + nullable: true + properties: + capacity: + description: |- + Capacity is the target fill percentage of total volume capacity (e.g., "95%"). + Mutually exclusive with Remaining. + pattern: ^\d{1,3}%$ + type: string + path: + description: Path is the mount path inside the target pod to fill (e.g., "/data", "/var/log") + minLength: 1 + type: string + remaining: + description: |- + Remaining is the amount of free space to leave on the volume (e.g., "50Mi", "1Gi"). + Mutually exclusive with Capacity. + type: string + writeSyscall: + description: |- + WriteSyscall optionally enables eBPF-based write syscall interception to return errors + with configurable probability. This runs in addition to the volume fill. + nullable: true + properties: + exitCode: + default: ENOSPC + description: ExitCode is the errno to return on intercepted write syscalls. + enum: + - ENOSPC + - EDQUOT + - EIO + - EROFS + - EFBIG + - EPERM + - EACCES + type: string + probability: + description: 'Probability is the percentage of write syscalls to fail (e.g., "50%"). Default: "100%".' 
+ type: string + type: object + required: + - path + type: object diskPressure: description: DiskPressureSpec represents a disk pressure disruption nullable: true @@ -689,6 +733,8 @@ spec: UnsafemodeSpec represents a spec with parameters to turn off specific safety nets designed to catch common traps or issues running a disruption All of these are turned off by default, so disabling safety nets requires manually changing these booleans to true properties: + allowDiskFullNoFloor: + type: boolean allowRootDiskFailure: type: boolean config: diff --git a/chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml b/chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml index e9f636d1f6..cc530f81e0 100644 --- a/chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml +++ b/chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml @@ -163,6 +163,50 @@ spec: required: - paths type: object + diskFull: + description: DiskFullSpec represents a disk full (ENOSPC) disruption that fills a target volume + nullable: true + properties: + capacity: + description: |- + Capacity is the target fill percentage of total volume capacity (e.g., "95%"). + Mutually exclusive with Remaining. + pattern: ^\d{1,3}%$ + type: string + path: + description: Path is the mount path inside the target pod to fill (e.g., "/data", "/var/log") + minLength: 1 + type: string + remaining: + description: |- + Remaining is the amount of free space to leave on the volume (e.g., "50Mi", "1Gi"). + Mutually exclusive with Capacity. + type: string + writeSyscall: + description: |- + WriteSyscall optionally enables eBPF-based write syscall interception to return errors + with configurable probability. This runs in addition to the volume fill. + nullable: true + properties: + exitCode: + default: ENOSPC + description: ExitCode is the errno to return on intercepted write syscalls. 
+ enum: + - ENOSPC + - EDQUOT + - EIO + - EROFS + - EFBIG + - EPERM + - EACCES + type: string + probability: + description: 'Probability is the percentage of write syscalls to fail (e.g., "50%"). Default: "100%".' + type: string + type: object + required: + - path + type: object diskPressure: description: DiskPressureSpec represents a disk pressure disruption nullable: true @@ -690,6 +734,8 @@ spec: UnsafemodeSpec represents a spec with parameters to turn off specific safety nets designed to catch common traps or issues running a disruption All of these are turned off by default, so disabling safety nets requires manually changing these booleans to true properties: + allowDiskFullNoFloor: + type: boolean allowRootDiskFailure: type: boolean config: diff --git a/chart/templates/generated/chaos.datadoghq.com_disruptions.yaml b/chart/templates/generated/chaos.datadoghq.com_disruptions.yaml index fa6b4be1e3..0f6ba0051b 100644 --- a/chart/templates/generated/chaos.datadoghq.com_disruptions.yaml +++ b/chart/templates/generated/chaos.datadoghq.com_disruptions.yaml @@ -153,6 +153,50 @@ spec: required: - paths type: object + diskFull: + description: DiskFullSpec represents a disk full (ENOSPC) disruption that fills a target volume + nullable: true + properties: + capacity: + description: |- + Capacity is the target fill percentage of total volume capacity (e.g., "95%"). + Mutually exclusive with Remaining. + pattern: ^\d{1,3}%$ + type: string + path: + description: Path is the mount path inside the target pod to fill (e.g., "/data", "/var/log") + minLength: 1 + type: string + remaining: + description: |- + Remaining is the amount of free space to leave on the volume (e.g., "50Mi", "1Gi"). + Mutually exclusive with Capacity. + type: string + writeSyscall: + description: |- + WriteSyscall optionally enables eBPF-based write syscall interception to return errors + with configurable probability. This runs in addition to the volume fill. 
+ nullable: true + properties: + exitCode: + default: ENOSPC + description: ExitCode is the errno to return on intercepted write syscalls. + enum: + - ENOSPC + - EDQUOT + - EIO + - EROFS + - EFBIG + - EPERM + - EACCES + type: string + probability: + description: 'Probability is the percentage of write syscalls to fail (e.g., "50%"). Default: "100%".' + type: string + type: object + required: + - path + type: object diskPressure: description: DiskPressureSpec represents a disk pressure disruption nullable: true @@ -680,6 +724,8 @@ spec: UnsafemodeSpec represents a spec with parameters to turn off specific safety nets designed to catch common traps or issues running a disruption All of these are turned off by default, so disabling safety nets requires manually changing these booleans to true properties: + allowDiskFullNoFloor: + type: boolean allowRootDiskFailure: type: boolean config: diff --git a/cli/injector/disk_full.go b/cli/injector/disk_full.go new file mode 100644 index 0000000000..3f82ec9939 --- /dev/null +++ b/cli/injector/disk_full.go @@ -0,0 +1,74 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. 
+ +package main + +import ( + "errors" + "os" + "strings" + + "github.com/spf13/cobra" + + "github.com/DataDog/chaos-controller/api/v1beta1" + "github.com/DataDog/chaos-controller/injector" + "github.com/DataDog/chaos-controller/o11y/tags" +) + +var diskFullCmd = &cobra.Command{ + Use: "disk-full", + Short: "Disk full (ENOSPC) subcommand", + Run: injectAndWait, + PreRun: func(cmd *cobra.Command, args []string) { + path, _ := cmd.Flags().GetString("path") + capacity, _ := cmd.Flags().GetString("capacity") + remaining, _ := cmd.Flags().GetString("remaining") + writeExitCode, _ := cmd.Flags().GetString("write-exit-code") + writeProbability, _ := cmd.Flags().GetString("write-probability") + + spec := v1beta1.DiskFullSpec{ + Path: path, + Capacity: capacity, + Remaining: remaining, + } + + if writeExitCode != "" || writeProbability != "" { + spec.WriteSyscall = &v1beta1.WriteSyscallSpec{ + ExitCode: writeExitCode, + Probability: writeProbability, + } + } + + for _, config := range configs { + inj, err := injector.NewDiskFullInjector(spec, injector.DiskFullInjectorConfig{Config: config}) + if err != nil { + if errors.Is(err, os.ErrNotExist) || strings.Contains(err.Error(), "No such file or directory") { + log.Errorw("error initializing the disk full injector because the given path does not exist", tags.ErrorKey, err) + } else if errors.Is(err, os.ErrPermission) { + log.Errorw("error initializing the disk full injector because the given path is not accessible", tags.ErrorKey, err) + } else { + log.Fatalw("error initializing the disk full injector", tags.ErrorKey, err) + } + } + + if inj == nil { + log.Debugln("skipping this injector because path cannot be found on specified container") + continue + } + + injectors = append(injectors, inj) + } + }, +} + +func init() { + diskFullCmd.Flags().String("path", "", "Path to apply disk full disruption to") + diskFullCmd.Flags().String("capacity", "", "Target fill percentage of total volume capacity
(e.g., 95%)") + diskFullCmd.Flags().String("remaining", "", "Amount of free space to leave on the volume (e.g., 50Mi)") + diskFullCmd.Flags().String("write-exit-code", "", "Error code to return on intercepted write syscalls (e.g., ENOSPC)") + diskFullCmd.Flags().String("write-probability", "", "Percentage of write syscalls to fail (e.g., 50%)") + + _ = diskFullCmd.MarkFlagRequired("path") +} diff --git a/cli/injector/main.go b/cli/injector/main.go index 69d91f603b..19b67ea2d5 100644 --- a/cli/injector/main.go +++ b/cli/injector/main.go @@ -90,6 +90,7 @@ func init() { rootCmd.AddCommand(memoryPressureCmd) rootCmd.AddCommand(memoryPressureStressCmd) rootCmd.AddCommand(diskFailureCmd) + rootCmd.AddCommand(diskFullCmd) rootCmd.AddCommand(diskPressureCmd) rootCmd.AddCommand(grpcDisruptionCmd) rootCmd.AddCommand(dnsDisruptionCmd) diff --git a/docs/disk_full.md b/docs/disk_full.md new file mode 100644 index 0000000000..a0ace4f7b6 --- /dev/null +++ b/docs/disk_full.md @@ -0,0 +1,163 @@ +# Disk full (ENOSPC) + +The `diskFull` field offers a way to genuinely fill a target pod volume, causing real ENOSPC errors on all subsequent write operations. Unlike disk pressure (which throttles I/O) or disk failure (which intercepts `openat` syscalls), this disruption makes the filesystem actually run out of space — visible to `df`, `statfs()`, Kubernetes eviction, and monitoring systems. + +## How it works + +The injector creates a **ballast file** (`.chaos-diskfull-{disruption-name}`) at the target path using the `fallocate(2)` syscall, which is instant (O(1), metadata-only allocation on ext4/xfs). On filesystems that don't support `fallocate`, it falls back to writing zeros. + +When the disruption is cleaned up, the ballast file is removed and space is freed immediately.
+ +## Spec fields + +| Field | Type | Required | Description | +|-------------|--------|----------|-------------| +| `path` | string | Yes | Mount path inside the target pod to fill (e.g., `/data`, `/var/log`) | +| `capacity` | string | One of | Fill to this percentage of total volume capacity (e.g., `"95%"`) | +| `remaining` | string | One of | Leave only this much free space on the volume (e.g., `"50Mi"`, `"1Gi"`) | +| `writeSyscall` | object | No | Optional eBPF-based write syscall interception (see below) | + +`capacity` and `remaining` are **mutually exclusive** — exactly one must be set. + +### writeSyscall (optional) + +When set, an eBPF program is launched alongside the volume fill to intercept `write` syscalls and return errors with configurable probability. This is useful for testing partial write failures or for environments where the volume fill alone isn't sufficient. + +| Field | Type | Default | Description | +|---------------|--------|-----------|-------------| +| `exitCode` | string | `ENOSPC` | errno to return: `ENOSPC`, `EDQUOT`, `EIO`, `EROFS`, `EFBIG`, `EPERM`, `EACCES` | +| `probability` | string | `"100%"` | Percentage of write syscalls to fail (1-100%) | + +**Requirements:** The kernel must support eBPF with `CONFIG_BPF_KPROBE_OVERRIDE` enabled. 
+ +## Examples + +### Fill to 95% capacity + +```yaml +apiVersion: chaos.datadoghq.com/v1beta1 +kind: Disruption +metadata: + name: disk-full-test + namespace: my-app +spec: + level: pod + selector: + app: my-service + count: 1 + duration: 10m + diskFull: + path: "/data" + capacity: "95%" +``` + +### Leave only 10Mi free + +```yaml +apiVersion: chaos.datadoghq.com/v1beta1 +kind: Disruption +metadata: + name: disk-full-remaining + namespace: my-app +spec: + level: pod + selector: + app: my-service + count: 1 + duration: 5m + diskFull: + path: "/var/log" + remaining: "10Mi" +``` + +### Volume fill + eBPF write interception + +```yaml +apiVersion: chaos.datadoghq.com/v1beta1 +kind: Disruption +metadata: + name: disk-full-with-ebpf + namespace: my-app +spec: + level: pod + selector: + app: my-service + count: 1 + duration: 10m + diskFull: + path: "/data" + capacity: "90%" + writeSyscall: + exitCode: ENOSPC + probability: "50%" +``` + +### Fill to 100% (requires unsafeMode) + +By default, the controller enforces a 1Mi minimum free space safety floor to prevent filesystem journal corruption. To fill completely: + +```yaml +apiVersion: chaos.datadoghq.com/v1beta1 +kind: Disruption +metadata: + name: disk-full-complete + namespace: my-app +spec: + level: pod + selector: + app: my-service + count: 1 + duration: 5m + unsafeMode: + allowDiskFullNoFloor: true + diskFull: + path: "/data" + capacity: "100%" +``` + +## Safety + +### Minimum free space floor + +A 1Mi safety floor is enforced by default. This prevents: +- Filesystem journal corruption on ext4 +- Inability to perform cleanup operations +- Cascade failures from completely exhausted filesystems + +Override with `unsafeMode.allowDiskFullNoFloor: true`. + +### Ephemeral storage eviction + +If the target pod has `resources.limits.ephemeral-storage` set and the target volume is ephemeral (`emptyDir`), filling the volume may cause the kubelet to evict the pod. 
This is **realistic behavior** — it's exactly what would happen if the application itself filled the disk. + +The controller emits a warning Kubernetes Event when this condition is detected, but does not block the disruption. + +### Level restriction + +Disk full disruptions are **pod-level only**. Node-level disk fill is not supported because it can crash the kubelet and affect all pods on the node. + +## Manual cleanup instructions + +If the chaos pod crashes before cleanup and the finalizer fails: + +1. Identify the ballast file on the target node: + +```shell +find /var/lib/kubelet/pods/ -name ".chaos-diskfull-*" -type f +``` + +2. Remove it: + +```shell +rm /path/to/.chaos-diskfull-<disruption-name> +``` + +Space is freed immediately upon file removal. + +## Comparison with other disk disruptions + +| Disruption | Mechanism | ENOSPC on writes? | Visible to `df`/monitoring? | Affects open FDs? | +|---|---|---|---|---| +| **Disk Pressure** | Cgroup blkio throttling | No (slows I/O only) | No | N/A | +| **Disk Failure** | eBPF on `openat` | Only on file open | No | No | +| **Disk Full** | Real space allocation | Yes (all syscalls) | Yes | Yes | diff --git a/docs/disruption_catalogue.md b/docs/disruption_catalogue.md index 547fed4c80..fe87fdd0fb 100644 --- a/docs/disruption_catalogue.md +++ b/docs/disruption_catalogue.md @@ -15,14 +15,15 @@ | [CPU Pressure](#4-cpu-pressure) | Consume CPU cycles in target cgroup | Pod, Node | Yes | Yes | | [Memory Pressure](#5-memory-pressure) | Gradually consume memory in target cgroup | Pod, Node | Yes | Yes | | [Disk Pressure](#6-disk-pressure) | Throttle read/write I/O throughput | Pod, Node | Yes | Yes | -| [Disk Failure](#7-disk-failure) | Fail file open syscalls via eBPF | Pod, Node | Yes* | Yes | -| [Container Failure](#8-container-failure) | Kill container processes (SIGTERM/SIGKILL) | Pod | No | No (exclusive) | -| [Node Failure](#9-node-failure) | Kernel panic or power-off a node | Node | No | No (exclusive) | -| [Pod
Replacement](#10-pod-replacement) | Cordon node, delete pod and optionally PVCs | Pod | No | No (exclusive) | +| [Disk Full](#7-disk-full) | Fill volume to trigger real ENOSPC | Pod | Yes | Yes | +| [Disk Failure](#8-disk-failure) | Fail file open syscalls via eBPF | Pod, Node | Yes* | Yes | +| [Container Failure](#9-container-failure) | Kill container processes (SIGTERM/SIGKILL) | Pod | No | No (exclusive) | +| [Node Failure](#10-node-failure) | Kernel panic or power-off a node | Node | No | No (exclusive) | +| [Pod Replacement](#11-pod-replacement) | Cordon node, delete pod and optionally PVCs | Pod | No | No (exclusive) | \* Disk Failure injection is removed when the injector process exits. -**Combination rule:** Network, DNS, gRPC, CPU Pressure, Memory Pressure, Disk Pressure, and Disk Failure can all be applied together in a single Disruption resource. Container Failure, Node Failure, and Pod Replacement are mutually exclusive with every other disruption type. +**Combination rule:** Network, DNS, gRPC, CPU Pressure, Memory Pressure, Disk Pressure, Disk Full, and Disk Failure can all be applied together in a single Disruption resource. Container Failure, Node Failure, and Pod Replacement are mutually exclusive with every other disruption type. --- @@ -738,7 +739,53 @@ spec: --- -## 7. Disk Failure +## 7. Disk Full + +Fills a target pod volume using the `fallocate(2)` syscall, causing real ENOSPC errors on all subsequent write operations. Unlike Disk Pressure (throttles I/O) or Disk Failure (intercepts `openat`), this disruption makes the filesystem genuinely run out of space — visible to `df`, `statfs()`, Kubernetes eviction, and monitoring. 
+ +### Configuration + +| Field | Type | Default | Description | +| ----------- | ------ | ---------- | ------------------------------------------------------------- | +| `path` | string | (required) | Mount path inside the target pod to fill | +| `capacity` | string | — | Fill to this percentage of total capacity (e.g., `"95%"`) | +| `remaining` | string | — | Leave only this much free space (e.g., `"50Mi"`, `"1Gi"`) | + +`capacity` and `remaining` are mutually exclusive — exactly one must be set. + +### Constraints and Limitations + +| Constraint | Detail | +| --- | --- | +| Level | Pod only | +| Safety floor | 1Mi minimum free space enforced by default. Override with `unsafeMode.allowDiskFullNoFloor: true` | +| Ephemeral storage | Filling a volume may trigger kubelet eviction if the pod has `ephemeral-storage` limits | +| Filesystem support | `fallocate(2)` is instant on ext4/xfs. Falls back to writing zeros on NFS/FUSE | + +### Example + +```yaml +apiVersion: chaos.datadoghq.com/v1beta1 +kind: Disruption +metadata: + name: disk-full-test + namespace: my-app +spec: + level: pod + selector: + app: my-service + count: 1 + duration: 10m + diskFull: + path: "/data" + capacity: "95%" +``` + +See [docs/disk_full.md](disk_full.md) for full documentation including manual cleanup instructions. + +--- + +## 8. Disk Failure Uses eBPF to intercept `openat` syscalls and return error codes, simulating file system failures. @@ -791,7 +838,7 @@ spec: --- -## 8. Container Failure +## 9. Container Failure Sends a termination signal to container processes. @@ -832,7 +879,7 @@ spec: --- -## 9. Node Failure +## 10. Node Failure Triggers a kernel panic or power-off by writing to `/proc/sysrq-trigger`. This is **irreversible** — the node becomes unavailable. @@ -892,7 +939,7 @@ spec: --- -## 10. Pod Replacement +## 11. Pod Replacement Simulates complete pod rescheduling: cordons the node, optionally deletes PVCs, deletes the pod, then uncordons. 
diff --git a/ebpf/const-arm.go b/ebpf/const-arm.go index 10c4318297..78fb005551 100644 --- a/ebpf/const-arm.go +++ b/ebpf/const-arm.go @@ -9,3 +9,4 @@ package ebpf const SysOpenat = "__arm64_sys_openat" +const SysWrite = "__arm64_sys_write" diff --git a/ebpf/const-x64.go b/ebpf/const-x64.go index 3c73bb35b9..0d21b5442b 100644 --- a/ebpf/const-x64.go +++ b/ebpf/const-x64.go @@ -9,3 +9,4 @@ package ebpf const SysOpenat = "__x64_sys_openat" +const SysWrite = "__x64_sys_write" diff --git a/ebpf/disk-full-write/injection.bpf.c b/ebpf/disk-full-write/injection.bpf.c new file mode 100644 index 0000000000..be7b7eaf5f --- /dev/null +++ b/ebpf/disk-full-write/injection.bpf.c @@ -0,0 +1,102 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +// Intercepts write syscalls (write, pwrite64) and returns a configurable error +// code (default ENOSPC) with configurable probability for a target process and +// its children. Used by the disk-full disruption for eBPF-based write failure +// injection alongside real volume fill. 
+
+// +build ignore
+#include "injection.bpf.h"
+
+const volatile pid_t target_pid = 0;
+const volatile pid_t exclude_pid;
+const volatile pid_t exit_code = ENOSPC;
+const volatile int probability = 100;
+
+unsigned int hits = 0;
+unsigned int disruptedHits = 0;
+
+struct data_t {
+    u32 ppid;
+    u32 pid;
+    u32 tid;
+    u32 id;
+    char comm[100];
+};
+
+struct {
+    __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+    __uint(max_entries, 1024);
+    __type(key, int);
+    __type(value, u32);
+} events SEC(".maps");
+
+SEC("kprobe/sys_write")
+int injection_disk_full_write(struct pt_regs *ctx)
+{
+    struct data_t data = {};
+
+    // Get data of the current process (the helper returns tgid << 32 | thread id)
+    u32 ppid = 0;
+    u32 pid = bpf_get_current_pid_tgid() >> 32;
+    if (pid == exclude_pid) {
+        return 0;
+    }
+    u32 tid = (u32)bpf_get_current_pid_tgid();
+    u32 gid = bpf_get_current_uid_gid() >> 32;
+
+    if (pid != 1) {
+        // Get parent pid
+        struct task_struct *task;
+        struct task_struct *real_parent;
+        task = (struct task_struct *)bpf_get_current_task();
+        bpf_probe_read(&real_parent, sizeof(real_parent), &task->real_parent);
+        bpf_probe_read(&ppid, sizeof(ppid), &real_parent->tgid);
+    }
+
+    // Allow only the target process and its children; pid 1 goes through this filter too.
+    if (target_pid != 0 && ppid != target_pid && pid != target_pid) {
+        return 0;
+    }
+
+    if (ppid == exclude_pid) {
+        return 0;
+    }
+
+    // No path filtering for write syscalls — when a disk is full, ALL writes
+    // to the filesystem fail with ENOSPC, regardless of the target file.
+
+    if (probability != 100) {
+        if (hits != 0) {
+            unsigned long long scaled_disruptedHits = (unsigned long long)disruptedHits * 100;
+            unsigned long long scaled_hits = hits;
+
+            if ((scaled_disruptedHits / scaled_hits) > probability) {
+                hits++;
+                return 0;
+            }
+        }
+
+        hits++;
+        disruptedHits++;
+    }
+
+    data.ppid = ppid;
+    data.pid = pid;
+    data.tid = tid;
+    data.id = gid;
+
+    // Get command name
+    bpf_get_current_comm(&data.comm, sizeof(data.comm));
+
+    // Add the event to the perf event array (send the whole struct, not a hard-coded size)
+    bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &data, sizeof(data));
+
+    // Override return of write syscall with error code (default -ENOSPC)
+    bpf_override_return(ctx, -exit_code);
+
+    return 0;
+}
diff --git a/ebpf/disk-full-write/injection.bpf.h b/ebpf/disk-full-write/injection.bpf.h
new file mode 100644
index 0000000000..b81513d959
--- /dev/null
+++ b/ebpf/disk-full-write/injection.bpf.h
@@ -0,0 +1,10 @@
+// Unless explicitly stated otherwise all files in this repository are licensed
+// under the Apache License Version 2.0.
+// This product includes software developed at Datadog (https://www.datadoghq.com/).
+// Copyright 2026 Datadog, Inc.
+
+/* In Linux 5.4 asm_inline was introduced, but it's not supported by clang.
+ * Redefine it to just asm to enable successful compilation.
+ * see https://github.com/iovisor/bcc/commit/2d1497cde1cc9835f759a707b42dea83bee378b8 for more details
+ */
+#include "../includes/bpf_common.h"
diff --git a/ebpf/disk-full-write/main.go b/ebpf/disk-full-write/main.go
new file mode 100644
index 0000000000..bdcfa78713
--- /dev/null
+++ b/ebpf/disk-full-write/main.go
@@ -0,0 +1,130 @@
+// Unless explicitly stated otherwise all files in this repository are licensed
+// under the Apache License Version 2.0.
+// This product includes software developed at Datadog (https://www.datadoghq.com/).
+// Copyright 2026 Datadog, Inc.
+ +//go:build !cgo +// +build !cgo + +package main + +import ( + "C" + "bytes" + "encoding/binary" + "flag" + "os" + "os/signal" + + "github.com/DataDog/chaos-controller/ebpf" + "github.com/DataDog/chaos-controller/log" + bpf "github.com/aquasecurity/libbpfgo" + "github.com/aquasecurity/libbpfgo/helpers" + "go.uber.org/zap" +) + +var nPid = flag.Uint64("process", 0, "Process to disrupt") +var nProbability = flag.Uint64("probability", 100, "Probability to disrupt") +var nExitCode = flag.Uint64("exit-code", 28, "Exit code (default ENOSPC=28)") + +var logger *zap.SugaredLogger + +func main() { + sig := make(chan os.Signal, 1) + signal.Notify(sig, os.Interrupt) + + var err error + logger, err = log.NewZapLogger() + must(err) + + bpf.SetLoggerCbs(bpf.Callbacks{ + Log: func(level int, msg string) { + switch level { + case bpf.LibbpfDebugLevel: + logger.Debug(msg) + case bpf.LibbpfInfoLevel: + logger.Info(msg) + case bpf.LibbpfWarnLevel: + logger.Warn(msg) + default: + logger.Error(msg) + } + }, + }) + + // Create the bpf module + bpfModule, err := bpf.NewModuleFromFile("/usr/local/bin/bpf-disk-full-write.bpf.o") + must(err) + defer bpfModule.Close() + + initGlobalVariables(bpfModule) + + err = bpfModule.BPFLoadObject() + must(err) + + // reads data from the trace pipe that bpf_trace_printk() writes to + go helpers.TracePipeListen() + + // Load the BPF program + prog, err := bpfModule.GetProgram("injection_disk_full_write") + must(err) + + // Attach the kprobe to catch sys_write syscall + _, err = prog.AttachKprobe(ebpf.SysWrite) + must(err) + + // Create the ring buffer to store events + e := make(chan []byte, 300) + p, err := bpfModule.InitPerfBuf("events", e, nil, 1024) + must(err) + + p.Start() + + go func() { + for data := range e { + printEvent(data) + } + }() + + <-sig + p.Stop() +} + +func printEvent(data []byte) { + ppid := int(binary.LittleEndian.Uint32(data[0:4])) + pid := int(binary.LittleEndian.Uint32(data[4:8])) + tid := 
int(binary.LittleEndian.Uint32(data[8:12])) + gid := int(binary.LittleEndian.Uint32(data[12:16])) + comm := string(bytes.TrimRight(data[16:], "\x00")) + logger.Infof("Disrupt write Ppid %d, Pid %d, Tid: %d, Gid: %d, Command: %s", ppid, pid, tid, gid, comm) +} + +func initGlobalVariables(bpfModule *bpf.Module) { + flag.Parse() + + var pid uint32 = uint32(*nPid) + if err := bpfModule.InitGlobalVariable("target_pid", pid); err != nil { + must(err) + } + + var exitCode uint32 = uint32(*nExitCode) + if err := bpfModule.InitGlobalVariable("exit_code", exitCode); err != nil { + must(err) + } + + var probability uint32 = uint32(*nProbability) + if err := bpfModule.InitGlobalVariable("probability", probability); err != nil { + must(err) + } + + currentPid := uint32(os.Getpid()) + if err := bpfModule.InitGlobalVariable("exclude_pid", currentPid); err != nil { + must(err) + } +} + +func must(err error) { + if err != nil { + panic(err) + } +} diff --git a/fallocate/fallocate_darwin.go b/fallocate/fallocate_darwin.go new file mode 100644 index 0000000000..eb1eb983ef --- /dev/null +++ b/fallocate/fallocate_darwin.go @@ -0,0 +1,40 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +// Fallocate implementation for Darwin using F_PREALLOCATE fcntl. +// Based on https://github.com/detailyang/go-fallocate (MIT License). + +package fallocate + +import ( + "os" + "syscall" + "unsafe" +) + +// Fallocate pre-allocates disk space for the given file on macOS. 
+func Fallocate(file *os.File, offset int64, length int64) error { + if length == 0 { + return nil + } + + fst := syscall.Fstore_t{ + Flags: syscall.F_ALLOCATECONTIG, + Posmode: syscall.F_PREALLOCATE, + Offset: 0, + Length: offset + length, + Bytesalloc: 0, + } + + // Try contiguous allocation first, fall back to non-contiguous + // See: https://lists.apple.com/archives/darwin-dev/2007/Dec/msg00040.html + _, _, err := syscall.Syscall(syscall.SYS_FCNTL, file.Fd(), syscall.F_PREALLOCATE, uintptr(unsafe.Pointer(&fst))) + if err != syscall.Errno(0x0) { + fst.Flags = syscall.F_ALLOCATEALL + _, _, _ = syscall.Syscall(syscall.SYS_FCNTL, file.Fd(), syscall.F_PREALLOCATE, uintptr(unsafe.Pointer(&fst))) + } + + return syscall.Ftruncate(int(file.Fd()), fst.Length) +} diff --git a/fallocate/fallocate_linux.go b/fallocate/fallocate_linux.go new file mode 100644 index 0000000000..ab27c7ef8f --- /dev/null +++ b/fallocate/fallocate_linux.go @@ -0,0 +1,37 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +// Fallocate implementation for Linux using the fallocate(2) syscall. +// Based on https://github.com/detailyang/go-fallocate (MIT License). +// Falls back to writing zeros if the filesystem does not support fallocate. + +package fallocate + +import ( + "errors" + "os" + "syscall" +) + +// Fallocate allocates disk space for the given file without writing data. +// If the filesystem does not support fallocate (EOPNOTSUPP), it falls back +// to writing zeros. 
+func Fallocate(file *os.File, offset int64, length int64) error { + if length == 0 { + return nil + } + + err := syscall.Fallocate(int(file.Fd()), 0, offset, length) + if err == nil { + return nil + } + + // Fall back to writing zeros on unsupported filesystems (e.g., NFS, some FUSE) + if errors.Is(err, syscall.EOPNOTSUPP) || errors.Is(err, syscall.ENOTSUP) { + return fallocateWrite(file, offset, length) + } + + return err +} diff --git a/fallocate/fallocate_other.go b/fallocate/fallocate_other.go new file mode 100644 index 0000000000..4186b6940c --- /dev/null +++ b/fallocate/fallocate_other.go @@ -0,0 +1,19 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +// Fallocate fallback for platforms that don't support fallocate or F_PREALLOCATE. +// Based on https://github.com/detailyang/go-fallocate (MIT License). + +//go:build !linux && !darwin + +package fallocate + +import "os" + +// Fallocate allocates disk space by writing zeros. This is the fallback +// implementation for platforms without native fallocate support. +func Fallocate(file *os.File, offset int64, length int64) error { + return fallocateWrite(file, offset, length) +} diff --git a/fallocate/write.go b/fallocate/write.go new file mode 100644 index 0000000000..06338a557d --- /dev/null +++ b/fallocate/write.go @@ -0,0 +1,38 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +package fallocate + +import ( + "io" + "os" +) + +const writeChunkSize = 65536 + +// fallocateWrite allocates disk space by writing zeros in chunks. +// Used as a fallback when the platform or filesystem doesn't support fallocate. 
+func fallocateWrite(file *os.File, offset int64, length int64) error {
+	if _, err := file.Seek(offset, io.SeekStart); err != nil {
+		return err
+	}
+
+	var buf [writeChunkSize]byte
+
+	for length > 0 {
+		n := int64(writeChunkSize)
+		if length < n {
+			n = length
+		}
+
+		if _, err := file.Write(buf[:n]); err != nil {
+			return err
+		}
+
+		length -= n
+	}
+
+	return nil
+}
diff --git a/injector/disk_full.go b/injector/disk_full.go
new file mode 100644
index 0000000000..d537f57ad0
--- /dev/null
+++ b/injector/disk_full.go
@@ -0,0 +1,326 @@
+// Unless explicitly stated otherwise all files in this repository are licensed
+// under the Apache License Version 2.0.
+// This product includes software developed at Datadog (https://www.datadoghq.com/).
+// Copyright 2026 Datadog, Inc.
+
+package injector
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"syscall"
+
+	"github.com/DataDog/chaos-controller/api/v1beta1"
+	"github.com/DataDog/chaos-controller/command"
+	"github.com/DataDog/chaos-controller/ebpf"
+	"github.com/DataDog/chaos-controller/env"
+	"github.com/DataDog/chaos-controller/fallocate"
+	"github.com/DataDog/chaos-controller/process"
+	"github.com/DataDog/chaos-controller/types"
+	"k8s.io/apimachinery/pkg/api/resource"
+)
+
+const (
+	// minFreeSpaceBytes is the safety floor enforced unless unsafemode overrides it (1Mi)
+	minFreeSpaceBytes = 1024 * 1024
+	// ballastFilePrefix is the prefix for ballast files created by the disk full injector
+	ballastFilePrefix = ".chaos-diskfull-"
+	// EBPFDiskFullWriteCmd is the name of the eBPF binary for write syscall interception
+	EBPFDiskFullWriteCmd = "bpf-disk-full-write"
+)
+
+type diskFullInjector struct {
+	spec        v1beta1.DiskFullSpec
+	config      DiskFullInjectorConfig
+	hostPath    string
+	ballastPath string
+}
+
+// DiskFullInjectorConfig is the disk full injector config
+type DiskFullInjectorConfig struct {
+	Config
+	// CmdFactory is required when WriteSyscall is configured (for launching the eBPF binary)
+	CmdFactory command.Factory
+	// ProcessManager is required when WriteSyscall is configured
+	ProcessManager process.Manager
+	// BPFConfigInformer is required when WriteSyscall is configured
+	BPFConfigInformer ebpf.ConfigInformer
+}
+
+// NewDiskFullInjector creates a disk full injector with the given config
+func NewDiskFullInjector(spec v1beta1.DiskFullSpec, config DiskFullInjectorConfig) (Injector, error) {
+	path := spec.Path
+
+	// get root mount path
+	mountHost, ok := os.LookupEnv(env.InjectorMountHost)
+	if !ok {
+		return nil, fmt.Errorf("environment variable %s doesn't exist", env.InjectorMountHost)
+	}
+
+	// get path from container info if we target a pod
+	if config.Disruption.Level == types.DisruptionLevelPod {
+		var err error
+
+		path, err = config.TargetContainer.Runtime().HostPath(config.TargetContainer.ID(), spec.Path)
+		if err != nil {
+			return nil, fmt.Errorf("error resolving host path for disk full disruption: %w", err)
+		}
+
+		if len(path) == 0 {
+			config.Log.Warnf("could not apply injector on container: %s; %s not found on this targeted container.", config.TargetContainer.Name(), spec.Path)
+			return nil, nil
+		}
+	}
+
+	hostPath := filepath.Clean(mountHost + path)
+
+	// validate path exists
+	if _, err := os.Stat(hostPath); err != nil {
+		return nil, fmt.Errorf("target path %s does not exist: %w", hostPath, err)
+	}
+
+	// initialize eBPF dependencies when writeSyscall is configured
+	if spec.WriteSyscall != nil {
+		if config.CmdFactory == nil {
+			config.CmdFactory = command.NewFactory(config.Disruption.DryRun)
+		}
+
+		if config.ProcessManager == nil {
+			config.ProcessManager = process.NewManager(config.Disruption.DryRun)
+		}
+
+		if config.BPFConfigInformer == nil {
+			var err error
+
+			config.BPFConfigInformer, err = ebpf.NewConfigInformer(config.Log, config.Disruption.DryRun, nil, nil, nil)
+			if err != nil {
+				return nil, fmt.Errorf("could not create an instance of eBPF config informer for the disk full disruption: %w", err)
+			}
+		}
+	}
+
+	ballastPath := filepath.Join(hostPath, ballastFilePrefix+config.Disruption.DisruptionName)
+
+	return &diskFullInjector{
+		spec:        spec,
+		config:      config,
+		hostPath:    hostPath,
+		ballastPath: ballastPath,
+	}, nil
+}
+
+func (i *diskFullInjector) TargetName() string {
+	return i.config.TargetName()
+}
+
+func (i *diskFullInjector) GetDisruptionKind() types.DisruptionKindName {
+	return types.DisruptionKindDiskFull
+}
+
+func (i *diskFullInjector) Inject() error {
+	// Phase 1: Volume fill
+	if err := i.injectVolumeFill(); err != nil {
+		return err
+	}
+
+	// Phase 2: Optional eBPF write syscall interception
+	if i.spec.WriteSyscall != nil {
+		if err := i.injectWriteSyscall(); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (i *diskFullInjector) injectVolumeFill() error {
+	// get filesystem stats
+	var stat syscall.Statfs_t
+	if err := syscall.Statfs(i.hostPath, &stat); err != nil {
+		return fmt.Errorf("error getting filesystem stats for %s: %w", i.hostPath, err)
+	}
+
+	// Note: on Linux, Blocks/Bavail are in units of Frsize (fragment size), not Bsize.
+	// On ext4/xfs (the common case), Bsize == Frsize. We use Bsize here for Darwin
+	// compatibility in tests. The injector runs on Linux where this is correct for
+	// standard filesystems.
+	totalBytes := stat.Blocks * uint64(stat.Bsize)
+	// Bavail excludes space reserved for root (~5% on ext4), so we may slightly
+	// underestimate bytes to fill. This is the safe direction.
+	availableBytes := stat.Bavail * uint64(stat.Bsize)
+
+	bytesToFill, err := i.computeBytesToFill(totalBytes, availableBytes)
+	if err != nil {
+		return fmt.Errorf("error computing bytes to fill: %w", err)
+	}
+
+	// enforce 1Mi safety floor
+	if availableBytes > minFreeSpaceBytes && bytesToFill > availableBytes-minFreeSpaceBytes {
+		bytesToFill = availableBytes - minFreeSpaceBytes
+		i.config.Log.Infow("clamped fill size to enforce 1Mi safety floor",
+			"bytesToFill", bytesToFill,
+			"availableBytes", availableBytes,
+		)
+	}
+
+	if bytesToFill == 0 || availableBytes <= minFreeSpaceBytes { // the clamp above is skipped when <= 1Mi is free: never fill below the floor
+		i.config.Log.Infow("volume already at or past target fill level or safety floor, skipping injection",
+			"totalBytes", totalBytes,
+			"availableBytes", availableBytes,
+		)
+
+		return nil
+	}
+
+	if i.config.Disruption.DryRun {
+		i.config.Log.Infow("dry-run: would create ballast file",
+			"ballastPath", i.ballastPath,
+			"bytesToFill", bytesToFill,
+		)
+
+		return nil
+	}
+
+	i.config.Log.Infow("injecting disk full disruption",
+		"path", i.hostPath,
+		"ballastPath", i.ballastPath,
+		"bytesToFill", bytesToFill,
+		"totalBytes", totalBytes,
+		"availableBytes", availableBytes,
+	)
+
+	// Create ballast file and allocate space using fallocate syscall.
+	// On Linux, this uses fallocate(2) which is instant (metadata-only).
+	// Falls back to writing zeros if the filesystem doesn't support fallocate.
+	file, err := os.Create(i.ballastPath)
+	if err != nil {
+		return fmt.Errorf("error creating ballast file %s: %w", i.ballastPath, err)
+	}
+
+	defer func() {
+		if err := file.Close(); err != nil {
+			i.config.Log.Warnw("failed to close ballast file", "error", err)
+		}
+	}()
+
+	if err := fallocate.Fallocate(file, 0, int64(bytesToFill)); err != nil {
+		// Clean up partial file on failure
+		if removeErr := os.Remove(i.ballastPath); removeErr != nil {
+			i.config.Log.Warnw("failed to clean up partial ballast file", "error", removeErr)
+		}
+
+		return fmt.Errorf("error allocating disk space: %w", err)
+	}
+
+	i.config.Log.Infow("disk full disruption injected successfully",
+		"ballastPath", i.ballastPath,
+		"bytesToFill", bytesToFill,
+	)
+
+	return nil
+}
+
+func (i *diskFullInjector) injectWriteSyscall() error {
+	if err := i.config.BPFConfigInformer.ValidateRequiredSystemConfig(); err != nil {
+		return fmt.Errorf("the disk full write syscall interception needs a kernel supporting eBPF programs: %w", err)
+	}
+
+	if !i.config.BPFConfigInformer.GetMapTypes().HavePerfEventArrayMapType {
+		return fmt.Errorf("the disk full write syscall interception needs the perf event array map type, but the current kernel does not support this type of map")
+	}
+
+	pid := 0
+	if i.config.Disruption.Level == types.DisruptionLevelPod {
+		pid = int(i.config.TargetContainer.PID())
+	}
+
+	exitCode := i.spec.WriteSyscall.GetExitCodeInt()
+
+	probability := "100"
+	if i.spec.WriteSyscall.Probability != "" {
+		probability = strings.TrimSuffix(i.spec.WriteSyscall.Probability, "%")
+	}
+
+	args := []string{
+		"-process", strconv.Itoa(pid),
+		"-exit-code", strconv.Itoa(exitCode),
+		"-probability", probability,
+	}
+
+	i.config.Log.Infow("starting eBPF write syscall interception",
+		"pid", pid,
+		"exitCode", i.spec.WriteSyscall.ExitCode,
+		"probability", probability,
+	)
+
+	cmd := i.config.CmdFactory.NewCmd(context.Background(), EBPFDiskFullWriteCmd, args)
+
+	bgCmd := command.NewBackgroundCmd(cmd, i.config.Log, i.config.ProcessManager)
+	if err := bgCmd.Start(); err != nil {
+		return fmt.Errorf("unable to run the eBPF disk full write interception: %w", err)
+	}
+
+	return nil
+}
+
+func (i *diskFullInjector) computeBytesToFill(totalBytes, availableBytes uint64) (uint64, error) {
+	if i.spec.Capacity != "" {
+		percentStr := strings.TrimSuffix(i.spec.Capacity, "%")
+
+		percent, err := strconv.Atoi(percentStr)
+		if err != nil {
+			return 0, fmt.Errorf("invalid capacity percentage %q: %w", i.spec.Capacity, err)
+		}
+
+		usedBytes := totalBytes - availableBytes
+		targetUsed := totalBytes * uint64(percent) / 100
+
+		if targetUsed <= usedBytes {
+			return 0, nil
+		}
+
+		return targetUsed - usedBytes, nil
+	}
+
+	if i.spec.Remaining != "" {
+		qty, err := resource.ParseQuantity(i.spec.Remaining)
+		if err != nil {
+			return 0, fmt.Errorf("invalid remaining quantity %q: %w", i.spec.Remaining, err)
+		}
+
+		remainingTarget := uint64(qty.Value())
+		if availableBytes <= remainingTarget {
+			return 0, nil
+		}
+
+		return availableBytes - remainingTarget, nil
+	}
+
+	return 0, fmt.Errorf("either capacity or remaining must be set")
+}
+
+func (i *diskFullInjector) UpdateConfig(config Config) {
+	i.config.Config = config
+}
+
+func (i *diskFullInjector) Clean() error {
+	i.config.Log.Infow("cleaning disk full disruption", "ballastPath", i.ballastPath)
+
+	if err := os.Remove(i.ballastPath); err != nil {
+		if errors.Is(err, os.ErrNotExist) {
+			i.config.Log.Infow("ballast file already removed", "ballastPath", i.ballastPath)
+			return nil
+		}
+
+		return fmt.Errorf("error removing ballast file %s: %w", i.ballastPath, err)
+	}
+
+	i.config.Log.Infow("disk full disruption cleaned successfully", "ballastPath", i.ballastPath)
+
+	return nil
+}
diff --git a/injector/disk_full_test.go b/injector/disk_full_test.go
new file mode 100644
index 0000000000..d6bf149fa3
--- /dev/null
+++ b/injector/disk_full_test.go
@@ -0,0 +1,222 @@
+// Unless explicitly stated otherwise
all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +package injector_test + +import ( + "os" + "path/filepath" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + chaosapi "github.com/DataDog/chaos-controller/api" + "github.com/DataDog/chaos-controller/api/v1beta1" + "github.com/DataDog/chaos-controller/env" + . "github.com/DataDog/chaos-controller/injector" + "github.com/DataDog/chaos-controller/types" +) + +var _ = Describe("DiskFull", func() { + var ( + config DiskFullInjectorConfig + inj Injector + spec v1beta1.DiskFullSpec + tmpDir string + ) + + BeforeEach(func() { + var err error + tmpDir, err = os.MkdirTemp("", "chaos-diskfull-test-*") + Expect(err).ToNot(HaveOccurred()) + + // env vars — set mount host to empty so hostPath = tmpDir directly + os.Setenv(env.InjectorMountHost, "") + + // config — node level to avoid needing container runtime mock + config = DiskFullInjectorConfig{ + Config: Config{ + Log: log, + MetricsSink: ms, + Disruption: chaosapi.DisruptionArgs{ + Level: types.DisruptionLevelNode, + DisruptionName: "test-disruption", + }, + }, + } + + spec = v1beta1.DiskFullSpec{ + Path: tmpDir, + Capacity: "95%", + } + }) + + AfterEach(func() { + os.Unsetenv(env.InjectorMountHost) + os.RemoveAll(tmpDir) + }) + + Describe("NewDiskFullInjector", func() { + It("should create an injector successfully", func() { + var err error + inj, err = NewDiskFullInjector(spec, config) + Expect(err).ToNot(HaveOccurred()) + Expect(inj).ToNot(BeNil()) + }) + + It("should return an error when the path does not exist", func() { + spec.Path = "/nonexistent/path/that/does/not/exist" + inj, err := NewDiskFullInjector(spec, config) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("does not exist")) + Expect(inj).To(BeNil()) + }) + + It("should return an error when mount host env var is 
not set", func() { + os.Unsetenv(env.InjectorMountHost) + inj, err := NewDiskFullInjector(spec, config) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring(env.InjectorMountHost)) + Expect(inj).To(BeNil()) + }) + + It("should return the correct disruption kind", func() { + var err error + inj, err = NewDiskFullInjector(spec, config) + Expect(err).ToNot(HaveOccurred()) + Expect(string(inj.GetDisruptionKind())).To(Equal("disk-full")) + }) + }) + + Describe("Inject", func() { + JustBeforeEach(func() { + var err error + inj, err = NewDiskFullInjector(spec, config) + Expect(err).ToNot(HaveOccurred()) + Expect(inj).ToNot(BeNil()) + }) + + Context("with capacity percentage", func() { + BeforeEach(func() { + spec.Capacity = "95%" + spec.Remaining = "" + }) + + It("should create a ballast file", func() { + err := inj.Inject() + Expect(err).ToNot(HaveOccurred()) + + ballastPath := filepath.Join(tmpDir, ".chaos-diskfull-test-disruption") + info, statErr := os.Stat(ballastPath) + Expect(statErr).ToNot(HaveOccurred()) + Expect(info.Size()).To(BeNumerically(">", 0)) + }) + }) + + Context("with remaining quantity", func() { + BeforeEach(func() { + spec.Capacity = "" + spec.Remaining = "50Mi" + }) + + It("should create a ballast file", func() { + err := inj.Inject() + Expect(err).ToNot(HaveOccurred()) + + ballastPath := filepath.Join(tmpDir, ".chaos-diskfull-test-disruption") + info, statErr := os.Stat(ballastPath) + Expect(statErr).ToNot(HaveOccurred()) + Expect(info.Size()).To(BeNumerically(">", 0)) + }) + }) + + Context("with dry-run mode", func() { + BeforeEach(func() { + config.Disruption.DryRun = true + }) + + It("should not create a ballast file", func() { + err := inj.Inject() + Expect(err).ToNot(HaveOccurred()) + + ballastPath := filepath.Join(tmpDir, ".chaos-diskfull-test-disruption") + _, statErr := os.Stat(ballastPath) + Expect(os.IsNotExist(statErr)).To(BeTrue()) + }) + }) + + Context("with remaining larger than available space", func() { + 
BeforeEach(func() { + spec.Capacity = "" + spec.Remaining = "999Ti" + }) + + It("should skip injection without error", func() { + err := inj.Inject() + Expect(err).ToNot(HaveOccurred()) + + ballastPath := filepath.Join(tmpDir, ".chaos-diskfull-test-disruption") + _, statErr := os.Stat(ballastPath) + Expect(os.IsNotExist(statErr)).To(BeTrue()) + }) + }) + }) + + Describe("Inject and Clean round trip", func() { + It("should create and then remove the ballast file", func() { + spec.Capacity = "" + spec.Remaining = "0" + + inj, err := NewDiskFullInjector(spec, config) + Expect(err).ToNot(HaveOccurred()) + + err = inj.Inject() + Expect(err).ToNot(HaveOccurred()) + + ballastPath := filepath.Join(tmpDir, ".chaos-diskfull-test-disruption") + _, statErr := os.Stat(ballastPath) + Expect(statErr).ToNot(HaveOccurred()) + + err = inj.Clean() + Expect(err).ToNot(HaveOccurred()) + + _, statErr = os.Stat(ballastPath) + Expect(os.IsNotExist(statErr)).To(BeTrue()) + }) + }) + + Describe("Clean", func() { + JustBeforeEach(func() { + var err error + inj, err = NewDiskFullInjector(spec, config) + Expect(err).ToNot(HaveOccurred()) + Expect(inj).ToNot(BeNil()) + }) + + Context("when ballast file exists", func() { + BeforeEach(func() { + ballastPath := filepath.Join(tmpDir, ".chaos-diskfull-test-disruption") + err := os.WriteFile(ballastPath, []byte("ballast"), 0644) + Expect(err).ToNot(HaveOccurred()) + }) + + It("should remove the ballast file", func() { + err := inj.Clean() + Expect(err).ToNot(HaveOccurred()) + + ballastPath := filepath.Join(tmpDir, ".chaos-diskfull-test-disruption") + _, statErr := os.Stat(ballastPath) + Expect(os.IsNotExist(statErr)).To(BeTrue()) + }) + }) + + Context("when ballast file does not exist", func() { + It("should succeed without error (idempotent)", func() { + err := inj.Clean() + Expect(err).ToNot(HaveOccurred()) + }) + }) + }) +}) diff --git a/safemode/safemode.go b/safemode/safemode.go index 90787d2c2e..a4649fc877 100644 --- a/safemode/safemode.go +++ 
b/safemode/safemode.go @@ -40,6 +40,12 @@ func AddAllSafemodeObjects(disruption v1beta1.Disruption, k8sClient client.Clien safemodeList = append(safemodeList, &safemodeDiskPressure) } + if disruption.Spec.DiskFull != nil { + safemodeDiskFull := DiskFull{} + safemodeDiskFull.Init(disruption, k8sClient) + safemodeList = append(safemodeList, &safemodeDiskFull) + } + if disruption.Spec.DiskFailure != nil { safemodeDiskFailure := DiskFailure{} safemodeDiskFailure.Init(disruption, k8sClient) diff --git a/safemode/safemode_disk_full.go b/safemode/safemode_disk_full.go new file mode 100644 index 0000000000..d605938124 --- /dev/null +++ b/safemode/safemode_disk_full.go @@ -0,0 +1,22 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026 Datadog, Inc. + +package safemode + +import ( + "github.com/DataDog/chaos-controller/api/v1beta1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +type DiskFull struct { + dis v1beta1.Disruption + client client.Client +} + +// Init Refer to safemode.Safemode interface for documentation +func (sm *DiskFull) Init(disruption v1beta1.Disruption, client client.Client) { + sm.dis = disruption + sm.client = client +} diff --git a/types/types.go b/types/types.go index 06d1be1038..d6097c6a18 100644 --- a/types/types.go +++ b/types/types.go @@ -75,6 +75,8 @@ const ( DisruptionKindCPUStress = "cpu-pressure-stress" // DisruptionKindDiskFailure is a disk failure disruption DisruptionKindDiskFailure = "disk-failure" + // DisruptionKindDiskFull is a disk full (ENOSPC) disruption + DisruptionKindDiskFull = "disk-full" // DisruptionKindDiskPressure is a disk pressure disruption DisruptionKindDiskPressure = "disk-pressure" // DisruptionKindGRPCDisruption is a grpc disruption @@ -161,6 +163,7 @@ var DisruptionKindNames = []DisruptionKindName{ DisruptionKindCPUPressure, 
DisruptionKindMemoryPressure, DisruptionKindDiskPressure, + DisruptionKindDiskFull, DisruptionKindDiskFailure, DisruptionKindGRPCDisruption, DisruptionKindDNSDisruption, From 499ca662482f892bf9b9a91621976c57655d1b38 Mon Sep 17 00:00:00 2001 From: Thibault NORMAND Date: Wed, 8 Apr 2026 17:33:08 +0200 Subject: [PATCH 2/4] test(disruption): add safety net to prevent filling the CI runner. --- injector/disk_full_test.go | 64 +++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/injector/disk_full_test.go b/injector/disk_full_test.go index d6bf149fa3..478f781e7f 100644 --- a/injector/disk_full_test.go +++ b/injector/disk_full_test.go @@ -6,8 +6,10 @@ package injector_test import ( + "fmt" "os" "path/filepath" + "syscall" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -98,27 +100,19 @@ var _ = Describe("DiskFull", func() { Expect(inj).ToNot(BeNil()) }) - Context("with capacity percentage", func() { + Context("with a small allocation that fits in available space", func() { BeforeEach(func() { - spec.Capacity = "95%" - spec.Remaining = "" - }) - - It("should create a ballast file", func() { - err := inj.Inject() + // Compute a remaining value that will only allocate about 1Mi. + // remaining = available - 2Mi, so bytesToFill = available - remaining = 2Mi before the safety floor.
+ var stat syscall.Statfs_t + err := syscall.Statfs(tmpDir, &stat) Expect(err).ToNot(HaveOccurred()) - ballastPath := filepath.Join(tmpDir, ".chaos-diskfull-test-disruption") - info, statErr := os.Stat(ballastPath) - Expect(statErr).ToNot(HaveOccurred()) - Expect(info.Size()).To(BeNumerically(">", 0)) - }) - }) - - Context("with remaining quantity", func() { - BeforeEach(func() { + availableBytes := stat.Bavail * uint64(stat.Bsize) + // Leave (available - 2Mi) as remaining, so we allocate ~2Mi minus safety floor = ~1Mi + targetRemaining := availableBytes - 2*1024*1024 spec.Capacity = "" - spec.Remaining = "50Mi" + spec.Remaining = formatBytes(targetRemaining) }) It("should create a ballast file", func() { @@ -132,12 +126,13 @@ var _ = Describe("DiskFull", func() { }) }) - Context("with dry-run mode", func() { + Context("with remaining larger than available space", func() { BeforeEach(func() { - config.Disruption.DryRun = true + spec.Capacity = "" + spec.Remaining = "999Ti" }) - It("should not create a ballast file", func() { + It("should skip injection without error", func() { err := inj.Inject() Expect(err).ToNot(HaveOccurred()) @@ -147,13 +142,12 @@ var _ = Describe("DiskFull", func() { }) }) - Context("with remaining larger than available space", func() { + Context("with dry-run mode", func() { BeforeEach(func() { - spec.Capacity = "" - spec.Remaining = "999Ti" + config.Disruption.DryRun = true }) - It("should skip injection without error", func() { + It("should not create a ballast file", func() { err := inj.Inject() Expect(err).ToNot(HaveOccurred()) @@ -166,8 +160,15 @@ var _ = Describe("DiskFull", func() { Describe("Inject and Clean round trip", func() { It("should create and then remove the ballast file", func() { + // Compute a remaining value that allocates only ~1Mi + var stat syscall.Statfs_t + err := syscall.Statfs(tmpDir, &stat) + Expect(err).ToNot(HaveOccurred()) + + availableBytes := stat.Bavail * uint64(stat.Bsize) + targetRemaining := 
availableBytes - 2*1024*1024 spec.Capacity = "" - spec.Remaining = "0" + spec.Remaining = formatBytes(targetRemaining) inj, err := NewDiskFullInjector(spec, config) Expect(err).ToNot(HaveOccurred()) @@ -220,3 +221,16 @@ var _ = Describe("DiskFull", func() { }) }) }) + +// formatBytes formats a byte count as a string suitable for resource.ParseQuantity, rounding down to a whole number of Gi or Mi +func formatBytes(bytes uint64) string { + if bytes >= 1024*1024*1024 { + return fmt.Sprintf("%dGi", bytes/(1024*1024*1024)) + } + + if bytes >= 1024*1024 { + return fmt.Sprintf("%dMi", bytes/(1024*1024)) + } + + return fmt.Sprintf("%d", bytes) +} From a8c20060dd8743deac5f44423e483122d549f0bc Mon Sep 17 00:00:00 2001 From: Thibault NORMAND Date: Thu, 9 Apr 2026 14:47:59 +0200 Subject: [PATCH 3/4] feat(disk-full): remove eBPF interception. --- api/v1beta1/disk_full.go | 96 ------------- api/v1beta1/disk_full_test.go | 118 ---------------- api/v1beta1/zz_generated.deepcopy.go | 22 +-- .../chaos.datadoghq.com_disruptioncrons.yaml | 22 --- ...haos.datadoghq.com_disruptionrollouts.yaml | 22 --- .../chaos.datadoghq.com_disruptions.yaml | 22 --- cli/injector/disk_full.go | 11 -- docs/disk_full.md | 34 ----- ebpf/disk-full-write/injection.bpf.c | 102 -------------- ebpf/disk-full-write/injection.bpf.h | 10 -- ebpf/disk-full-write/main.go | 130 ------------------ injector/disk_full.go | 89 +----------- 12 files changed, 2 insertions(+), 676 deletions(-) delete mode 100644 ebpf/disk-full-write/injection.bpf.c delete mode 100644 ebpf/disk-full-write/injection.bpf.h delete mode 100644 ebpf/disk-full-write/main.go diff --git a/api/v1beta1/disk_full.go b/api/v1beta1/disk_full.go index 13477df979..b257627793 100644 --- a/api/v1beta1/disk_full.go +++ b/api/v1beta1/disk_full.go @@ -27,43 +27,6 @@ type DiskFullSpec struct { // Remaining is the amount of free space to leave on the volume (e.g., "50Mi", "1Gi"). // Mutually exclusive with Capacity.
Remaining string `json:"remaining,omitempty"` - // WriteSyscall optionally enables eBPF-based write syscall interception to return errors - // with configurable probability. This runs in addition to the volume fill. - // +nullable - WriteSyscall *WriteSyscallSpec `json:"writeSyscall,omitempty"` -} - -// WriteSyscallSpec configures eBPF-based interception of write syscalls (write, pwrite64) -// to return a configurable error code with a given probability. -type WriteSyscallSpec struct { - // ExitCode is the errno to return on intercepted write syscalls. - // +kubebuilder:validation:Enum=ENOSPC;EDQUOT;EIO;EROFS;EFBIG;EPERM;EACCES - // +kubebuilder:default=ENOSPC - ExitCode string `json:"exitCode,omitempty" chaos_validate:"omitempty,oneofci=ENOSPC EDQUOT EIO EROFS EFBIG EPERM EACCES"` - // Probability is the percentage of write syscalls to fail (e.g., "50%"). Default: "100%". - Probability string `json:"probability,omitempty"` -} - -// GetExitCodeInt returns the integer value of the configured errno. 
-func (s *WriteSyscallSpec) GetExitCodeInt() int { - switch s.ExitCode { - case "ENOSPC": - return 28 - case "EDQUOT": - return 122 - case "EIO": - return 5 - case "EROFS": - return 30 - case "EFBIG": - return 27 - case "EPERM": - return 1 - case "EACCES": - return 13 - default: - return 28 // ENOSPC - } } // Validate validates args for the given disruption @@ -95,12 +58,6 @@ func (s *DiskFullSpec) Validate() (retErr error) { } } - if s.WriteSyscall != nil { - if err := validateWriteSyscallProbability(s.WriteSyscall.Probability); err != nil { - retErr = multierror.Append(retErr, err) - } - } - return retErr } @@ -136,29 +93,6 @@ func validateRemaining(remaining string) error { return nil } -func validateWriteSyscallProbability(probability string) error { - if probability == "" { - return nil - } - - if !strings.HasSuffix(probability, "%") { - return fmt.Errorf("writeSyscall probability must be a percentage suffixed with %%, got %q", probability) - } - - valueStr := strings.TrimSuffix(probability, "%") - - value, err := strconv.Atoi(valueStr) - if err != nil { - return fmt.Errorf("writeSyscall probability must be an integer, got %q: %w", valueStr, err) - } - - if value < 1 || value > 100 { - return fmt.Errorf("writeSyscall probability must be between 1 and 100, got %d", value) - } - - return nil -} - // GenerateArgs generates injection or cleanup pod arguments for the given spec func (s *DiskFullSpec) GenerateArgs() []string { args := []string{ @@ -175,22 +109,6 @@ func (s *DiskFullSpec) GenerateArgs() []string { args = append(args, "--remaining", s.Remaining) } - if s.WriteSyscall != nil { - exitCode := s.WriteSyscall.ExitCode - if exitCode == "" { - exitCode = "ENOSPC" - } - - args = append(args, "--write-exit-code", exitCode) - - probability := s.WriteSyscall.Probability - if probability == "" { - probability = "100%" - } - - args = append(args, "--write-probability", probability) - } - return args } @@ -208,19 +126,5 @@ func (s *DiskFullSpec) Explain() []string 
{ explanation += ", causing ENOSPC errors on subsequent write operations." - if s.WriteSyscall != nil { - exitCode := s.WriteSyscall.ExitCode - if exitCode == "" { - exitCode = "ENOSPC" - } - - probability := s.WriteSyscall.Probability - if probability == "" { - probability = "100%" - } - - explanation += fmt.Sprintf(" Additionally, write syscalls will be intercepted via eBPF and return %s %s of the time.", exitCode, probability) - } - return []string{"", explanation} } diff --git a/api/v1beta1/disk_full_test.go b/api/v1beta1/disk_full_test.go index 6aac922c33..0dce828ded 100644 --- a/api/v1beta1/disk_full_test.go +++ b/api/v1beta1/disk_full_test.go @@ -53,33 +53,6 @@ var _ = Describe("DiskFullSpec", func() { Remaining: "0", }, ), - Entry("with writeSyscall defaults", - DiskFullSpec{ - Path: "/data", - Capacity: "95%", - WriteSyscall: &WriteSyscallSpec{}, - }, - ), - Entry("with writeSyscall explicit values", - DiskFullSpec{ - Path: "/data", - Capacity: "95%", - WriteSyscall: &WriteSyscallSpec{ - ExitCode: "ENOSPC", - Probability: "50%", - }, - }, - ), - Entry("with writeSyscall EDQUOT exit code", - DiskFullSpec{ - Path: "/data", - Capacity: "80%", - WriteSyscall: &WriteSyscallSpec{ - ExitCode: "EDQUOT", - Probability: "100%", - }, - }, - ), ) DescribeTable("error cases", @@ -169,46 +142,6 @@ var _ = Describe("DiskFullSpec", func() { "one of capacity or remaining must be set", }, ), - Entry("with writeSyscall invalid probability (no suffix)", - DiskFullSpec{ - Path: "/data", - Capacity: "95%", - WriteSyscall: &WriteSyscallSpec{ - Probability: "50", - }, - }, - []string{"writeSyscall probability must be a percentage suffixed with %"}, - ), - Entry("with writeSyscall probability 0%", - DiskFullSpec{ - Path: "/data", - Capacity: "95%", - WriteSyscall: &WriteSyscallSpec{ - Probability: "0%", - }, - }, - []string{"writeSyscall probability must be between 1 and 100"}, - ), - Entry("with writeSyscall probability 101%", - DiskFullSpec{ - Path: "/data", - Capacity: "95%", - 
WriteSyscall: &WriteSyscallSpec{ - Probability: "101%", - }, - }, - []string{"writeSyscall probability must be between 1 and 100"}, - ), - Entry("with writeSyscall non-numeric probability", - DiskFullSpec{ - Path: "/data", - Capacity: "95%", - WriteSyscall: &WriteSyscallSpec{ - Probability: "abc%", - }, - }, - []string{"writeSyscall probability must be an integer"}, - ), ) }) @@ -233,25 +166,6 @@ var _ = Describe("DiskFullSpec", func() { }, []string{"--path", "/data", "--remaining", "50Mi"}, ), - Entry("with writeSyscall defaults", - DiskFullSpec{ - Path: "/data", - Capacity: "95%", - WriteSyscall: &WriteSyscallSpec{}, - }, - []string{"--path", "/data", "--capacity", "95%", "--write-exit-code", "ENOSPC", "--write-probability", "100%"}, - ), - Entry("with writeSyscall explicit values", - DiskFullSpec{ - Path: "/data", - Capacity: "90%", - WriteSyscall: &WriteSyscallSpec{ - ExitCode: "EDQUOT", - Probability: "50%", - }, - }, - []string{"--path", "/data", "--capacity", "90%", "--write-exit-code", "EDQUOT", "--write-probability", "50%"}, - ), ) }) @@ -280,37 +194,5 @@ var _ = Describe("DiskFullSpec", func() { Expect(explanation[1]).To(ContainSubstring("ENOSPC")) }) - It("explains writeSyscall mode", func() { - spec := DiskFullSpec{ - Path: "/data", - Capacity: "95%", - WriteSyscall: &WriteSyscallSpec{ - ExitCode: "ENOSPC", - Probability: "50%", - }, - } - explanation := spec.Explain() - Expect(explanation).To(HaveLen(2)) - Expect(explanation[1]).To(ContainSubstring("eBPF")) - Expect(explanation[1]).To(ContainSubstring("ENOSPC")) - Expect(explanation[1]).To(ContainSubstring("50%")) - }) - }) - - Describe("WriteSyscallSpec", func() { - DescribeTable("GetExitCodeInt", - func(exitCode string, expected int) { - spec := WriteSyscallSpec{ExitCode: exitCode} - Expect(spec.GetExitCodeInt()).To(Equal(expected)) - }, - Entry("ENOSPC", "ENOSPC", 28), - Entry("EDQUOT", "EDQUOT", 122), - Entry("EIO", "EIO", 5), - Entry("EROFS", "EROFS", 30), - Entry("EFBIG", "EFBIG", 27), - 
Entry("EPERM", "EPERM", 1), - Entry("EACCES", "EACCES", 13), - Entry("empty defaults to ENOSPC", "", 28), - ) }) }) diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index 6665c482b8..37a0a8e9f2 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -186,11 +186,6 @@ func (in *DiskFailureSpec) DeepCopy() *DiskFailureSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DiskFullSpec) DeepCopyInto(out *DiskFullSpec) { *out = *in - if in.WriteSyscall != nil { - in, out := &in.WriteSyscall, &out.WriteSyscall - *out = new(WriteSyscallSpec) - **out = **in - } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DiskFullSpec. @@ -669,7 +664,7 @@ func (in *DisruptionSpec) DeepCopyInto(out *DisruptionSpec) { if in.DiskFull != nil { in, out := &in.DiskFull, &out.DiskFull *out = new(DiskFullSpec) - (*in).DeepCopyInto(*out) + **out = **in } if in.DiskFailure != nil { in, out := &in.DiskFailure, &out.DiskFailure @@ -1237,18 +1232,3 @@ func (in *UnsafemodeSpec) DeepCopy() *UnsafemodeSpec { in.DeepCopyInto(out) return out } - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *WriteSyscallSpec) DeepCopyInto(out *WriteSyscallSpec) { - *out = *in -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WriteSyscallSpec. 
-func (in *WriteSyscallSpec) DeepCopy() *WriteSyscallSpec { - if in == nil { - return nil - } - out := new(WriteSyscallSpec) - in.DeepCopyInto(out) - return out -} diff --git a/chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml b/chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml index 19799d71c7..a0efe99bbe 100644 --- a/chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml +++ b/chart/templates/generated/chaos.datadoghq.com_disruptioncrons.yaml @@ -181,28 +181,6 @@ spec: Remaining is the amount of free space to leave on the volume (e.g., "50Mi", "1Gi"). Mutually exclusive with Capacity. type: string - writeSyscall: - description: |- - WriteSyscall optionally enables eBPF-based write syscall interception to return errors - with configurable probability. This runs in addition to the volume fill. - nullable: true - properties: - exitCode: - default: ENOSPC - description: ExitCode is the errno to return on intercepted write syscalls. - enum: - - ENOSPC - - EDQUOT - - EIO - - EROFS - - EFBIG - - EPERM - - EACCES - type: string - probability: - description: 'Probability is the percentage of write syscalls to fail (e.g., "50%"). Default: "100%".' - type: string - type: object required: - path type: object diff --git a/chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml b/chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml index cc530f81e0..8aa1a3405c 100644 --- a/chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml +++ b/chart/templates/generated/chaos.datadoghq.com_disruptionrollouts.yaml @@ -182,28 +182,6 @@ spec: Remaining is the amount of free space to leave on the volume (e.g., "50Mi", "1Gi"). Mutually exclusive with Capacity. type: string - writeSyscall: - description: |- - WriteSyscall optionally enables eBPF-based write syscall interception to return errors - with configurable probability. This runs in addition to the volume fill. 
- nullable: true - properties: - exitCode: - default: ENOSPC - description: ExitCode is the errno to return on intercepted write syscalls. - enum: - - ENOSPC - - EDQUOT - - EIO - - EROFS - - EFBIG - - EPERM - - EACCES - type: string - probability: - description: 'Probability is the percentage of write syscalls to fail (e.g., "50%"). Default: "100%".' - type: string - type: object required: - path type: object diff --git a/chart/templates/generated/chaos.datadoghq.com_disruptions.yaml b/chart/templates/generated/chaos.datadoghq.com_disruptions.yaml index 0f6ba0051b..4fdb65fef3 100644 --- a/chart/templates/generated/chaos.datadoghq.com_disruptions.yaml +++ b/chart/templates/generated/chaos.datadoghq.com_disruptions.yaml @@ -172,28 +172,6 @@ spec: Remaining is the amount of free space to leave on the volume (e.g., "50Mi", "1Gi"). Mutually exclusive with Capacity. type: string - writeSyscall: - description: |- - WriteSyscall optionally enables eBPF-based write syscall interception to return errors - with configurable probability. This runs in addition to the volume fill. - nullable: true - properties: - exitCode: - default: ENOSPC - description: ExitCode is the errno to return on intercepted write syscalls. - enum: - - ENOSPC - - EDQUOT - - EIO - - EROFS - - EFBIG - - EPERM - - EACCES - type: string - probability: - description: 'Probability is the percentage of write syscalls to fail (e.g., "50%"). Default: "100%".' 
- type: string - type: object required: - path type: object diff --git a/cli/injector/disk_full.go b/cli/injector/disk_full.go index 3f82ec9939..0370c615cf 100644 --- a/cli/injector/disk_full.go +++ b/cli/injector/disk_full.go @@ -25,8 +25,6 @@ var diskFullCmd = &cobra.Command{ path, _ := cmd.Flags().GetString("path") capacity, _ := cmd.Flags().GetString("capacity") remaining, _ := cmd.Flags().GetString("remaining") - writeExitCode, _ := cmd.Flags().GetString("write-exit-code") - writeProbability, _ := cmd.Flags().GetString("write-probability") spec := v1beta1.DiskFullSpec{ Path: path, @@ -34,13 +32,6 @@ var diskFullCmd = &cobra.Command{ Remaining: remaining, } - if writeExitCode != "" || writeProbability != "" { - spec.WriteSyscall = &v1beta1.WriteSyscallSpec{ - ExitCode: writeExitCode, - Probability: writeProbability, - } - } - for _, config := range configs { inj, err := injector.NewDiskFullInjector(spec, injector.DiskFullInjectorConfig{Config: config}) if err != nil { @@ -67,8 +58,6 @@ func init() { diskFullCmd.Flags().String("path", "", "Path to apply disk full disruption to") diskFullCmd.Flags().String("capacity", "", "Target fill percentage of total volume capacity (e.g., 95%)") diskFullCmd.Flags().String("remaining", "", "Amount of free space to leave on the volume (e.g., 50Mi)") - diskFullCmd.Flags().String("write-exit-code", "", "Error code to return on intercepted write syscalls (e.g., ENOSPC)") - diskFullCmd.Flags().String("write-probability", "", "Percentage of write syscalls to fail (e.g., 50%)") _ = cobra.MarkFlagRequired(diskFullCmd.PersistentFlags(), "path") } diff --git a/docs/disk_full.md b/docs/disk_full.md index a0ace4f7b6..42cff0cbab 100644 --- a/docs/disk_full.md +++ b/docs/disk_full.md @@ -15,21 +15,9 @@ When the disruption is cleaned up, the ballast file is removed and space is free | `path` | string | Yes | Mount path inside the target pod to fill (e.g., `/data`, `/var/log`) | | `capacity` | string | One of | Fill to this percentage of 
total volume capacity (e.g., `"95%"`) | | `remaining` | string | One of | Leave only this much free space on the volume (e.g., `"50Mi"`, `"1Gi"`) | -| `writeSyscall` | object | No | Optional eBPF-based write syscall interception (see below) | `capacity` and `remaining` are **mutually exclusive** — exactly one must be set. -### writeSyscall (optional) - -When set, an eBPF program is launched alongside the volume fill to intercept `write` syscalls and return errors with configurable probability. This is useful for testing partial write failures or for environments where the volume fill alone isn't sufficient. - -| Field | Type | Default | Description | -|---------------|--------|-----------|-------------| -| `exitCode` | string | `ENOSPC` | errno to return: `ENOSPC`, `EDQUOT`, `EIO`, `EROFS`, `EFBIG`, `EPERM`, `EACCES` | -| `probability` | string | `"100%"` | Percentage of write syscalls to fail (1-100%) | - -**Requirements:** The kernel must support eBPF with `CONFIG_BPF_KPROBE_OVERRIDE` enabled. - ## Examples ### Fill to 95% capacity @@ -70,28 +58,6 @@ spec: remaining: "10Mi" ``` -### Volume fill + eBPF write interception - -```yaml -apiVersion: chaos.datadoghq.com/v1beta1 -kind: Disruption -metadata: - name: disk-full-with-ebpf - namespace: my-app -spec: - level: pod - selector: - app: my-service - count: 1 - duration: 10m - diskFull: - path: "/data" - capacity: "90%" - writeSyscall: - exitCode: ENOSPC - probability: "50%" -``` - ### Fill to 100% (requires unsafeMode) By default, the controller enforces a 1Mi minimum free space safety floor to prevent filesystem journal corruption. To fill completely: diff --git a/ebpf/disk-full-write/injection.bpf.c b/ebpf/disk-full-write/injection.bpf.c deleted file mode 100644 index be7b7eaf5f..0000000000 --- a/ebpf/disk-full-write/injection.bpf.c +++ /dev/null @@ -1,102 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the Apache License Version 2.0. 
-// This product includes software developed at Datadog (https://www.datadoghq.com/). -// Copyright 2026 Datadog, Inc. - -// Intercepts write syscalls (write, pwrite64) and returns a configurable error -// code (default ENOSPC) with configurable probability for a target process and -// its children. Used by the disk-full disruption for eBPF-based write failure -// injection alongside real volume fill. - -// +build ignore -#include "injection.bpf.h" - -const volatile pid_t target_pid = 0; -const volatile pid_t exclude_pid; -const volatile pid_t exit_code = ENOSPC; -const volatile int probability = 100; - -unsigned int hits = 0; -unsigned int disruptedHits = 0; - -struct data_t { - u32 ppid; - u32 pid; - u32 tid; - u32 id; - char comm[100]; -}; - -struct { - __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); - __uint(max_entries, 1024); - __type(key, int); - __type(value, u32); -} events SEC(".maps"); - -SEC("kprobe/sys_write") -int injection_disk_full_write(struct pt_regs *ctx) -{ - struct data_t data = {}; - - // Get data of the current process - u32 ppid = 0; - u32 pid = bpf_get_current_pid_tgid(); - if (pid == exclude_pid) { - return 0; - } - u32 tid = bpf_get_current_pid_tgid() >> 32; - u32 gid = bpf_get_current_uid_gid(); - - if (pid != 1) { - // Get parent pid - struct task_struct *task; - struct task_struct *real_parent; - task = (struct task_struct *)bpf_get_current_task(); - bpf_probe_read(&real_parent, sizeof(real_parent), &task->real_parent); - bpf_probe_read(&ppid, sizeof(ppid), &real_parent->tgid); - - // Allow only children and parent process. - if (target_pid != 0 && ppid != target_pid && pid != target_pid) { - return 0; - } - } - - if (ppid == exclude_pid || tid == exclude_pid) { - return 0; - } - - // No path filtering for write syscalls — when a disk is full, ALL writes - // to the filesystem fail with ENOSPC, regardless of the target file. 
- - if (probability != 100) { - if (hits != 0) { - unsigned long long scaled_disruptedHits = disruptedHits * 100; - unsigned long long scaled_hits = hits; - - if ((scaled_disruptedHits / scaled_hits) > probability) { - hits++; - return 0; - } - } - - hits++; - disruptedHits++; - } - - data.ppid = ppid; - data.pid = pid; - data.tid = tid; - data.id = gid; - - // Get command name - bpf_get_current_comm(&data.comm, sizeof(data.comm)); - - // Add the event to the ring buffer - bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &data, 100); - - // Override return of write syscall with error code (default -ENOSPC) - bpf_override_return(ctx, -exit_code); - - return 0; -} diff --git a/ebpf/disk-full-write/injection.bpf.h b/ebpf/disk-full-write/injection.bpf.h deleted file mode 100644 index b81513d959..0000000000 --- a/ebpf/disk-full-write/injection.bpf.h +++ /dev/null @@ -1,10 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the Apache License Version 2.0. -// This product includes software developed at Datadog (https://www.datadoghq.com/). -// Copyright 2026 Datadog, Inc. - -/* In Linux 5.4 asm_inline was introduced, but it's not supported by clang. - * Redefine it to just asm to enable successful compilation. - * see https://github.com/iovisor/bcc/commit/2d1497cde1cc9835f759a707b42dea83bee378b8 for more details - */ -#include "../includes/bpf_common.h" diff --git a/ebpf/disk-full-write/main.go b/ebpf/disk-full-write/main.go deleted file mode 100644 index bdcfa78713..0000000000 --- a/ebpf/disk-full-write/main.go +++ /dev/null @@ -1,130 +0,0 @@ -// Unless explicitly stated otherwise all files in this repository are licensed -// under the Apache License Version 2.0. -// This product includes software developed at Datadog (https://www.datadoghq.com/). -// Copyright 2026 Datadog, Inc. 
- -//go:build !cgo -// +build !cgo - -package main - -import ( - "C" - "bytes" - "encoding/binary" - "flag" - "os" - "os/signal" - - "github.com/DataDog/chaos-controller/ebpf" - "github.com/DataDog/chaos-controller/log" - bpf "github.com/aquasecurity/libbpfgo" - "github.com/aquasecurity/libbpfgo/helpers" - "go.uber.org/zap" -) - -var nPid = flag.Uint64("process", 0, "Process to disrupt") -var nProbability = flag.Uint64("probability", 100, "Probability to disrupt") -var nExitCode = flag.Uint64("exit-code", 28, "Exit code (default ENOSPC=28)") - -var logger *zap.SugaredLogger - -func main() { - sig := make(chan os.Signal, 1) - signal.Notify(sig, os.Interrupt) - - var err error - logger, err = log.NewZapLogger() - must(err) - - bpf.SetLoggerCbs(bpf.Callbacks{ - Log: func(level int, msg string) { - switch level { - case bpf.LibbpfDebugLevel: - logger.Debug(msg) - case bpf.LibbpfInfoLevel: - logger.Info(msg) - case bpf.LibbpfWarnLevel: - logger.Warn(msg) - default: - logger.Error(msg) - } - }, - }) - - // Create the bpf module - bpfModule, err := bpf.NewModuleFromFile("/usr/local/bin/bpf-disk-full-write.bpf.o") - must(err) - defer bpfModule.Close() - - initGlobalVariables(bpfModule) - - err = bpfModule.BPFLoadObject() - must(err) - - // reads data from the trace pipe that bpf_trace_printk() writes to - go helpers.TracePipeListen() - - // Load the BPF program - prog, err := bpfModule.GetProgram("injection_disk_full_write") - must(err) - - // Attach the kprobe to catch sys_write syscall - _, err = prog.AttachKprobe(ebpf.SysWrite) - must(err) - - // Create the ring buffer to store events - e := make(chan []byte, 300) - p, err := bpfModule.InitPerfBuf("events", e, nil, 1024) - must(err) - - p.Start() - - go func() { - for data := range e { - printEvent(data) - } - }() - - <-sig - p.Stop() -} - -func printEvent(data []byte) { - ppid := int(binary.LittleEndian.Uint32(data[0:4])) - pid := int(binary.LittleEndian.Uint32(data[4:8])) - tid := 
int(binary.LittleEndian.Uint32(data[8:12])) - gid := int(binary.LittleEndian.Uint32(data[12:16])) - comm := string(bytes.TrimRight(data[16:], "\x00")) - logger.Infof("Disrupt write Ppid %d, Pid %d, Tid: %d, Gid: %d, Command: %s", ppid, pid, tid, gid, comm) -} - -func initGlobalVariables(bpfModule *bpf.Module) { - flag.Parse() - - var pid uint32 = uint32(*nPid) - if err := bpfModule.InitGlobalVariable("target_pid", pid); err != nil { - must(err) - } - - var exitCode uint32 = uint32(*nExitCode) - if err := bpfModule.InitGlobalVariable("exit_code", exitCode); err != nil { - must(err) - } - - var probability uint32 = uint32(*nProbability) - if err := bpfModule.InitGlobalVariable("probability", probability); err != nil { - must(err) - } - - currentPid := uint32(os.Getpid()) - if err := bpfModule.InitGlobalVariable("exclude_pid", currentPid); err != nil { - must(err) - } -} - -func must(err error) { - if err != nil { - panic(err) - } -} diff --git a/injector/disk_full.go b/injector/disk_full.go index d537f57ad0..93a6ecfada 100644 --- a/injector/disk_full.go +++ b/injector/disk_full.go @@ -6,7 +6,6 @@ package injector import ( - "context" "errors" "fmt" "os" @@ -16,11 +15,8 @@ import ( "syscall" "github.com/DataDog/chaos-controller/api/v1beta1" - "github.com/DataDog/chaos-controller/command" - "github.com/DataDog/chaos-controller/ebpf" "github.com/DataDog/chaos-controller/env" "github.com/DataDog/chaos-controller/fallocate" - "github.com/DataDog/chaos-controller/process" "github.com/DataDog/chaos-controller/types" "k8s.io/apimachinery/pkg/api/resource" ) @@ -30,8 +26,6 @@ const ( minFreeSpaceBytes = 1024 * 1024 // ballastFilePrefix is the prefix for ballast files created by the disk full injector ballastFilePrefix = ".chaos-diskfull-" - // EBPFDiskFullWriteCmd is the name of the eBPF binary for write syscall interception - EBPFDiskFullWriteCmd = "bpf-disk-full-write" ) type diskFullInjector struct { @@ -44,12 +38,6 @@ type diskFullInjector struct { // 
DiskFullInjectorConfig is the disk full injector config type DiskFullInjectorConfig struct { Config - // CmdFactory is required when WriteSyscall is configured (for launching the eBPF binary) - CmdFactory command.Factory - // ProcessManager is required when WriteSyscall is configured - ProcessManager process.Manager - // BPFConfigInformer is required when WriteSyscall is configured - BPFConfigInformer ebpf.ConfigInformer } // NewDiskFullInjector creates a disk full injector with the given config @@ -84,26 +72,6 @@ func NewDiskFullInjector(spec v1beta1.DiskFullSpec, config DiskFullInjectorConfi return nil, fmt.Errorf("target path %s does not exist: %w", hostPath, err) } - // initialize eBPF dependencies when writeSyscall is configured - if spec.WriteSyscall != nil { - if config.CmdFactory == nil { - config.CmdFactory = command.NewFactory(config.Disruption.DryRun) - } - - if config.ProcessManager == nil { - config.ProcessManager = process.NewManager(config.Disruption.DryRun) - } - - if config.BPFConfigInformer == nil { - var err error - - config.BPFConfigInformer, err = ebpf.NewConfigInformer(config.Log, config.Disruption.DryRun, nil, nil, nil) - if err != nil { - return nil, fmt.Errorf("could not create an instance of eBPF config informer for the disk full disruption: %w", err) - } - } - } - ballastPath := filepath.Join(hostPath, ballastFilePrefix+config.Disruption.DisruptionName) return &diskFullInjector{ @@ -123,19 +91,7 @@ func (i *diskFullInjector) GetDisruptionKind() types.DisruptionKindName { } func (i *diskFullInjector) Inject() error { - // Phase 1: Volume fill - if err := i.injectVolumeFill(); err != nil { - return err - } - - // Phase 2: Optional eBPF write syscall interception - if i.spec.WriteSyscall != nil { - if err := i.injectWriteSyscall(); err != nil { - return err - } - } - - return nil + return i.injectVolumeFill() } func (i *diskFullInjector) injectVolumeFill() error { @@ -225,49 +181,6 @@ func (i *diskFullInjector) injectVolumeFill() error { 
return nil } -func (i *diskFullInjector) injectWriteSyscall() error { - if err := i.config.BPFConfigInformer.ValidateRequiredSystemConfig(); err != nil { - return fmt.Errorf("the disk full write syscall interception needs a kernel supporting eBPF programs: %w", err) - } - - if !i.config.BPFConfigInformer.GetMapTypes().HavePerfEventArrayMapType { - return fmt.Errorf("the disk full write syscall interception needs the perf event array map type, but the current kernel does not support this type of map") - } - - pid := 0 - if i.config.Disruption.Level == types.DisruptionLevelPod { - pid = int(i.config.TargetContainer.PID()) - } - - exitCode := i.spec.WriteSyscall.GetExitCodeInt() - - probability := "100" - if i.spec.WriteSyscall.Probability != "" { - probability = strings.TrimSuffix(i.spec.WriteSyscall.Probability, "%") - } - - args := []string{ - "-process", strconv.Itoa(pid), - "-exit-code", strconv.Itoa(exitCode), - "-probability", probability, - } - - i.config.Log.Infow("starting eBPF write syscall interception", - "pid", pid, - "exitCode", i.spec.WriteSyscall.ExitCode, - "probability", probability, - ) - - cmd := i.config.CmdFactory.NewCmd(context.Background(), EBPFDiskFullWriteCmd, args) - - bgCmd := command.NewBackgroundCmd(cmd, i.config.Log, i.config.ProcessManager) - if err := bgCmd.Start(); err != nil { - return fmt.Errorf("unable to run the eBPF disk full write interception: %w", err) - } - - return nil -} - func (i *diskFullInjector) computeBytesToFill(totalBytes, availableBytes uint64) (uint64, error) { if i.spec.Capacity != "" { percentStr := strings.TrimSuffix(i.spec.Capacity, "%") From fb46e353a2e49b7fb8446e0744a9dcd5ba338e49 Mon Sep 17 00:00:00 2001 From: Thibault NORMAND Date: Thu, 16 Apr 2026 17:04:34 +0200 Subject: [PATCH 4/4] fix(disk-full): wire diskFull into validation, counting, explain, and address PR comments. 
Add diskFull to 5 missing registration points in validateGlobalDisruptionScope (at-least-one-kind check, ContainerFailure/NodeFailure/PodReplacement compatibility, OnInit compatibility), DisruptionCount(), and Explain(). Add writable shadow mount for the target path in chaos pod spec so the injector can write ballast files while keeping /mnt/host read-only. Add capacity mode test coverage, disk_full example, complete.yaml entry, and docs/README.md link. Co-Authored-By: Claude Opus 4.6 (1M context) --- api/v1beta1/disruption_types.go | 17 +++++++++++++---- builderstest/chaospod.go | 25 +++++++++++++++++++++++++ docs/README.md | 1 + examples/complete.yaml | 4 ++++ examples/disk_full.yaml | 19 +++++++++++++++++++ injector/disk_full_test.go | 18 ++++++++++++++++++ services/chaospod.go | 29 +++++++++++++++++++++++++++-- 7 files changed, 107 insertions(+), 6 deletions(-) create mode 100644 examples/disk_full.yaml diff --git a/api/v1beta1/disruption_types.go b/api/v1beta1/disruption_types.go index 5c8c68c8fd..8ae20059b5 100644 --- a/api/v1beta1/disruption_types.go +++ b/api/v1beta1/disruption_types.go @@ -698,25 +698,25 @@ func (s DisruptionSpec) validateGlobalDisruptionScope(requireSelectors bool) (re } // Rule: At least one disruption kind must be applied - if s.CPUPressure == nil && s.MemoryPressure == nil && s.DiskPressure == nil && s.DiskFailure == nil && s.Network == nil && s.GRPC == nil && s.DNS == nil && s.ContainerFailure == nil && s.NodeFailure == nil && s.PodReplacement == nil { + if s.CPUPressure == nil && s.MemoryPressure == nil && s.DiskPressure == nil && s.DiskFull == nil && s.DiskFailure == nil && s.Network == nil && s.GRPC == nil && s.DNS == nil && s.ContainerFailure == nil && s.NodeFailure == nil && s.PodReplacement == nil { retErr = multierror.Append(retErr, errors.New("at least one disruption kind must be specified, please read the docs to see your options")) } // Rule: ContainerFailure, NodeFailure, and PodReplacement disruptions are not compatible 
with other failure types if s.ContainerFailure != nil { - if s.CPUPressure != nil || s.MemoryPressure != nil || s.DiskPressure != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.DNS != nil || s.NodeFailure != nil || s.PodReplacement != nil { + if s.CPUPressure != nil || s.MemoryPressure != nil || s.DiskPressure != nil || s.DiskFull != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.DNS != nil || s.NodeFailure != nil || s.PodReplacement != nil { retErr = multierror.Append(retErr, errors.New("container failure disruptions are not compatible with other disruption kinds. The container failure will remove the impact of the other disruption types")) } } if s.NodeFailure != nil { - if s.CPUPressure != nil || s.MemoryPressure != nil || s.DiskPressure != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.DNS != nil || s.ContainerFailure != nil || s.PodReplacement != nil { + if s.CPUPressure != nil || s.MemoryPressure != nil || s.DiskPressure != nil || s.DiskFull != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.DNS != nil || s.ContainerFailure != nil || s.PodReplacement != nil { retErr = multierror.Append(retErr, errors.New("node failure disruptions are not compatible with other disruption kinds. The node failure will remove the impact of the other disruption types")) } } if s.PodReplacement != nil { - if s.CPUPressure != nil || s.MemoryPressure != nil || s.DiskPressure != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.DNS != nil || s.ContainerFailure != nil || s.NodeFailure != nil { + if s.CPUPressure != nil || s.MemoryPressure != nil || s.DiskPressure != nil || s.DiskFull != nil || s.DiskFailure != nil || s.Network != nil || s.GRPC != nil || s.DNS != nil || s.ContainerFailure != nil || s.NodeFailure != nil { retErr = multierror.Append(retErr, errors.New("pod replacement disruptions are not compatible with other disruption kinds. 
The pod replacement will remove the impact of the other disruption types")) } // Rule: container failure not possible if disruption is node-level @@ -733,6 +733,7 @@ func (s DisruptionSpec) validateGlobalDisruptionScope(requireSelectors bool) (re s.PodReplacement != nil || s.ContainerFailure != nil || s.DiskPressure != nil || + s.DiskFull != nil || s.GRPC != nil || s.DiskFailure != nil { retErr = multierror.Append(retErr, errors.New("OnInit is only compatible with network disruptions")) @@ -938,6 +939,10 @@ func (s DisruptionSpec) DisruptionCount() int { count++ } + if s.DiskFull != nil { + count++ + } + return count } @@ -1094,6 +1099,10 @@ func (s DisruptionSpec) Explain() []string { explanation = append(explanation, s.DiskFailure.Explain()...) } + if s.DiskFull != nil { + explanation = append(explanation, s.DiskFull.Explain()...) + } + if s.GRPC != nil { explanation = append(explanation, s.GRPC.Explain()...) } diff --git a/builderstest/chaospod.go b/builderstest/chaospod.go index 734e7451d5..57641654f7 100644 --- a/builderstest/chaospod.go +++ b/builderstest/chaospod.go @@ -6,6 +6,7 @@ package builderstest_test import ( + "path/filepath" "time" "github.com/DataDog/chaos-controller/env" @@ -175,6 +176,30 @@ func (b *ChaosPodBuilder) WithPullSecrets(imagePullSecrets []v1.LocalObjectRefer return b } +// WithDiskFullMount adds the writable shadow mount for disk-full disruptions. 
+func (b *ChaosPodBuilder) WithDiskFullMount(diskFullPath string) *ChaosPodBuilder { + b.modifiers = append( + b.modifiers, + func() { + hostPathDirectory := v1.HostPathDirectory + b.Spec.Volumes = append(b.Spec.Volumes, v1.Volume{ + Name: "disk-full-target", + VolumeSource: v1.VolumeSource{ + HostPath: &v1.HostPathVolumeSource{ + Path: diskFullPath, + Type: &hostPathDirectory, + }, + }, + }) + b.Spec.Containers[0].VolumeMounts = append(b.Spec.Containers[0].VolumeMounts, v1.VolumeMount{ + Name: "disk-full-target", + MountPath: filepath.Join("/mnt/host", diskFullPath), + }) + }) + + return b +} + // WithChaosSpec sets the chaos-specific pod spec. func (b *ChaosPodBuilder) WithChaosSpec(targetNodeName string, terminationGracePeriod, activeDeadlineSeconds int64, args []string, hostPathDirectory, pathFile v1.HostPathType, serviceAccountName string, image string, logLevel string) *ChaosPodBuilder { b.modifiers = append( diff --git a/docs/README.md b/docs/README.md index 157cbe7ac9..72f9343ef5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -22,6 +22,7 @@ This folder contains the detailed documentation of most parts of the project. 
- [CPU Pressure](cpu_pressure.md) - [Memory Pressure](memory_disruption.md) - [Disk Failure](disk_failure.md) + - [Disk Full](disk_full.md) - [Disk Pressure](disk_pressure.md) - [DNS Disruption](dns_disruption.md) - [GRPC Disruption](grpc_disruption.md) diff --git a/examples/complete.yaml b/examples/complete.yaml index b51c3295df..ea4235756e 100644 --- a/examples/complete.yaml +++ b/examples/complete.yaml @@ -111,6 +111,10 @@ spec: throttling: readBytesPerSec: 1024 # optional, read throttling in bytes per sec writeBytesPerSec: 2048 # optional, write throttling in bytes per sec + diskFull: # disk full (ENOSPC) disruption + path: /mnt/data # mount path inside the target pod to fill (e.g., "/data", "/var/log") + capacity: "95%" # optional (mutually exclusive with remaining), target fill percentage of total volume capacity + # remaining: "50Mi" # optional (mutually exclusive with capacity), amount of free space to leave on the volume (e.g., "50Mi", "1Gi") grpc: # disrupt gRPC responses by faking results port: 50051 # port that target grpc server is listening on endpoints: diff --git a/examples/disk_full.yaml b/examples/disk_full.yaml new file mode 100644 index 0000000000..0f93219ba1 --- /dev/null +++ b/examples/disk_full.yaml @@ -0,0 +1,19 @@ +# Unless explicitly stated otherwise all files in this repository are licensed +# under the Apache License Version 2.0. +# This product includes software developed at Datadog (https://www.datadoghq.com/). +# Copyright 2026 Datadog, Inc. 
+ +apiVersion: chaos.datadoghq.com/v1beta1 +kind: Disruption +metadata: + name: disk-full + namespace: chaos-demo +spec: + level: pod + selector: + service: demo-curl + count: 1 + duration: 10m + diskFull: + path: "/mnt/data" + capacity: "95%" diff --git a/injector/disk_full_test.go b/injector/disk_full_test.go index 478f781e7f..bec855da43 100644 --- a/injector/disk_full_test.go +++ b/injector/disk_full_test.go @@ -126,6 +126,24 @@ var _ = Describe("DiskFull", func() { }) }) + Context("with capacity mode", func() { + BeforeEach(func() { + // Use capacity mode: fill to 95% — allocate a tiny amount relative to the disk + spec.Capacity = "95%" + spec.Remaining = "" + }) + + It("should create a ballast file when disk is below target capacity", func() { + err := inj.Inject() + Expect(err).ToNot(HaveOccurred()) + + ballastPath := filepath.Join(tmpDir, ".chaos-diskfull-test-disruption") + info, statErr := os.Stat(ballastPath) + Expect(statErr).ToNot(HaveOccurred()) + Expect(info.Size()).To(BeNumerically(">", 0)) + }) + }) + Context("with remaining larger than available space", func() { BeforeEach(func() { spec.Capacity = "" diff --git a/services/chaospod.go b/services/chaospod.go index 317af67ab6..471cf73b98 100644 --- a/services/chaospod.go +++ b/services/chaospod.go @@ -8,6 +8,7 @@ package services import ( "context" "fmt" + "path/filepath" "reflect" "strings" "time" @@ -322,6 +323,11 @@ func (m *chaosPodService) GenerateChaosPodOfDisruption(disruption *chaosv1beta1. 
args = append(args, "--deadline", time.Now().Add(chaostypes.InjectorPadDuration).Add(disruption.RemainingDuration()).Format(time.RFC3339)) + var diskFullPath string + if kind == chaostypes.DisruptionKindDiskFull && disruption.Spec.DiskFull != nil { + diskFullPath = disruption.Spec.DiskFull.Path + } + chaosPod = corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ GenerateName: fmt.Sprintf("chaos-%s-", disruption.Name), // generate the pod name automatically with a prefix @@ -329,7 +335,7 @@ func (m *chaosPodService) GenerateChaosPodOfDisruption(disruption *chaosv1beta1. Annotations: m.config.Injector.Annotations, // add extra annotations passed to the controller Labels: m.generateLabels(disruption, targetName, kind), // add default and extra podLabels passed to the controller }, - Spec: m.generateChaosPodSpec(targetNodeName, terminationGracePeriod, activeDeadlineSeconds, args, hostPathDirectory, hostPathFile), + Spec: m.generateChaosPodSpec(targetNodeName, terminationGracePeriod, activeDeadlineSeconds, args, hostPathDirectory, hostPathFile, diskFullPath), } // add finalizer to the pod, so it is not deleted before we can control its exit status @@ -463,7 +469,7 @@ func (m *chaosPodService) generateLabels(disruption *chaosv1beta1.Disruption, ta return podLabels } -func (m *chaosPodService) generateChaosPodSpec(targetNodeName string, terminationGracePeriod int64, activeDeadlineSeconds int64, args []string, hostPathDirectory corev1.HostPathType, hostPathFile corev1.HostPathType) corev1.PodSpec { +func (m *chaosPodService) generateChaosPodSpec(targetNodeName string, terminationGracePeriod int64, activeDeadlineSeconds int64, args []string, hostPathDirectory corev1.HostPathType, hostPathFile corev1.HostPathType, diskFullPath string) corev1.PodSpec { podSpec := corev1.PodSpec{ HostPID: true, // enable host pid RestartPolicy: corev1.RestartPolicyNever, // do not restart the pod on fail or completion @@ -655,6 +661,25 @@ func (m *chaosPodService) generateChaosPodSpec(targetNodeName 
string, terminatio } } + // For disk-full disruptions, add a writable shadow mount for only the target path. + // The host root at /mnt/host stays read-only; this mounts the specific target directory + // as writable at /mnt/host/{diskFullPath}, so the injector's path resolution works unchanged. + if diskFullPath != "" { + podSpec.Volumes = append(podSpec.Volumes, corev1.Volume{ + Name: "disk-full-target", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: diskFullPath, + Type: &hostPathDirectory, + }, + }, + }) + podSpec.Containers[0].VolumeMounts = append(podSpec.Containers[0].VolumeMounts, corev1.VolumeMount{ + Name: "disk-full-target", + MountPath: filepath.Join("/mnt/host", diskFullPath), + }) + } + return podSpec }