Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions receiver/systemmetricsreceiver/scraper_disk_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"context"
"time"

"github.com/shirou/gopsutil/v3/disk"
"go.opentelemetry.io/collector/pdata/pcommon"
"go.opentelemetry.io/collector/pdata/pmetric"
"go.uber.org/zap"
Expand All @@ -17,6 +18,13 @@ import (
// Metric names and the per-mount sanity limit used by the disk scraper.
const (
// metricAggDiskUsed is the name of the gauge that carries the used bytes
// summed over all mounts.
metricAggDiskUsed = "aggregate_disk_used"
// metricAggDiskFree is the name of the gauge that carries the free bytes
// summed over all mounts.
metricAggDiskFree = "aggregate_disk_free"

// maxPlausibleMountBytes is the upper bound accepted for a single mount's
// stats: 1 << 50 bytes (1 PiB).
// The largest EBS volume is 64 TiB, so the cap sits 16x above anything
// legitimate.
// gopsutil's disk.UsageWithContext computes Total and Free as
// uint64(stat.Blocks) * uint64(stat.Bsize). Certain filesystem states
// (transient loop mounts, broken statvfs under FS pressure) can make that
// product a huge garbage value.
maxPlausibleMountBytes uint64 = 1 << 50
)

type diskScraper struct {
Expand All @@ -42,6 +50,14 @@ func (s *diskScraper) Scrape(ctx context.Context, metrics pmetric.Metrics) error

var totalUsed, totalFree uint64
for _, du := range parts {
if !isPlausibleDiskUsage(du) {
s.logger.Debug("Dropping disk sample: mount with implausible stats",
zap.String("path", du.Path),
zap.Uint64("total", du.Total),
zap.Uint64("free", du.Free),
zap.Uint64("used", du.Used))
return nil // drop entire sample to avoid poisoning min/max rollups
}
totalUsed += du.Used
totalFree += du.Free
}
Expand All @@ -53,3 +69,12 @@ func (s *diskScraper) Scrape(ctx context.Context, metrics pmetric.Metrics) error
addGaugeDP(sm.Metrics().AppendEmpty(), metricAggDiskFree, "Bytes", float64(totalFree), now)
return nil
}

// isPlausibleDiskUsage reports whether a single mount's usage stats are
// physically plausible and therefore safe to add to the aggregate totals.
//
// A mount is rejected when any of the following holds:
//   - Total is zero;
//   - Total exceeds maxPlausibleMountBytes;
//   - Free exceeds Total;
//   - Used exceeds Total.
//
// Used is validated alongside Free because the scraper sums both fields: a
// garbage Used value would poison aggregate_disk_used in exactly the same way
// a garbage Free value poisons aggregate_disk_free.
func isPlausibleDiskUsage(du *disk.UsageStat) bool {
	if du.Total == 0 || du.Total > maxPlausibleMountBytes {
		return false
	}
	// Free and Used are each bounded by Total here, and Total is already
	// bounded by the cap, so neither needs its own comparison against
	// maxPlausibleMountBytes.
	return du.Free <= du.Total && du.Used <= du.Total
}
66 changes: 62 additions & 4 deletions receiver/systemmetricsreceiver/scraper_disk_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ func TestDiskScraperName(t *testing.T) {

func TestDiskScraperMetrics(t *testing.T) {
ps := &MockPS{DiskUsageData: []*disk.UsageStat{
{Path: "/", Used: 50 * 1048576, Free: 78 * 1048576}, // 50 MB used, 78 MB free
{Path: "/home", Used: 100 * 1048576, Free: 200 * 1048576}, // 100 MB used, 200 MB free
{Path: "/", Total: 128 * 1048576, Used: 50 * 1048576, Free: 78 * 1048576},
{Path: "/home", Total: 300 * 1048576, Used: 100 * 1048576, Free: 200 * 1048576},
}}
s := newDiskScraper(zap.NewNop(), ps)

Expand All @@ -36,12 +36,10 @@ func TestDiskScraperMetrics(t *testing.T) {
sm := metrics.ResourceMetrics().At(0).ScopeMetrics().At(0)
require.Equal(t, 2, sm.Metrics().Len())

// DiskUsed = (50 + 100) * 1048576 bytes
assert.Equal(t, "aggregate_disk_used", sm.Metrics().At(0).Name())
assert.Equal(t, "Bytes", sm.Metrics().At(0).Unit())
assert.InDelta(t, 150*1048576.0, sm.Metrics().At(0).Gauge().DataPoints().At(0).DoubleValue(), 0.01)

// DiskFree = (78 + 200) * 1048576 bytes
assert.Equal(t, "aggregate_disk_free", sm.Metrics().At(1).Name())
assert.Equal(t, "Bytes", sm.Metrics().At(1).Unit())
assert.InDelta(t, 278*1048576.0, sm.Metrics().At(1).Gauge().DataPoints().At(0).DoubleValue(), 0.01)
Expand All @@ -64,3 +62,63 @@ func TestDiskScraperErrorSkips(t *testing.T) {
require.NoError(t, s.Scrape(context.Background(), metrics))
assert.Equal(t, 0, metrics.ResourceMetrics().Len())
}

// ---- plausibility guard tests ----

func TestDiskScraperDropsSampleWhenFreeExceedsTotal(t *testing.T) {
// A transient mount reports tiny total but huge free from a
// statvfs signed-multiply overflow.
ps := &MockPS{DiskUsageData: []*disk.UsageStat{
{Path: "/", Total: 1_000_000_000, Used: 500_000_000, Free: 500_000_000},
{Path: "/bad", Total: 4096, Used: 0, Free: 9_223_372_036_854_755_328},
}}
s := newDiskScraper(zap.NewNop(), ps)

metrics := pmetric.NewMetrics()
require.NoError(t, s.Scrape(context.Background(), metrics))
assert.Equal(t, 0, metrics.ResourceMetrics().Len(), "sample must be dropped when any mount has Free > Total")
}

func TestDiskScraperDropsSampleWhenTotalExceedsCap(t *testing.T) {
ps := &MockPS{DiskUsageData: []*disk.UsageStat{
{Path: "/", Total: 1_000_000_000, Used: 500_000_000, Free: 500_000_000},
{Path: "/huge", Total: 1 << 51, Used: 1 << 50, Free: 1 << 50},
}}
s := newDiskScraper(zap.NewNop(), ps)

metrics := pmetric.NewMetrics()
require.NoError(t, s.Scrape(context.Background(), metrics))
assert.Equal(t, 0, metrics.ResourceMetrics().Len(), "sample must be dropped when mount exceeds 1 PiB cap")
}

func TestDiskScraperDropsSampleWhenTotalIsZero(t *testing.T) {
ps := &MockPS{DiskUsageData: []*disk.UsageStat{
{Path: "/", Total: 1_000_000_000, Used: 500_000_000, Free: 500_000_000},
{Path: "/broken", Total: 0, Used: 0, Free: 0},
}}
s := newDiskScraper(zap.NewNop(), ps)

metrics := pmetric.NewMetrics()
require.NoError(t, s.Scrape(context.Background(), metrics))
assert.Equal(t, 0, metrics.ResourceMetrics().Len(), "sample must be dropped when mount has zero total")
}

// TestIsPlausibleDiskUsage is a table-driven test of isPlausibleDiskUsage
// covering accepted inputs (normal, full disk, exactly at the 1 PiB cap) and
// rejected inputs (zero total, free greater than total, total above the cap).
func TestIsPlausibleDiskUsage(t *testing.T) {
	type testCase struct {
		name  string
		usage *disk.UsageStat
		want  bool
	}

	cases := []testCase{
		{name: "normal", usage: &disk.UsageStat{Total: 1e9, Free: 5e8}, want: true},
		{name: "full disk", usage: &disk.UsageStat{Total: 1e9, Free: 0}, want: true},
		{name: "at 1 PiB cap", usage: &disk.UsageStat{Total: 1 << 50, Free: 1 << 50}, want: true},
		{name: "zero total", usage: &disk.UsageStat{Total: 0, Free: 0}, want: false},
		{name: "free > total", usage: &disk.UsageStat{Total: 4096, Free: 9_223_372_036_854_755_328}, want: false},
		{name: "above cap", usage: &disk.UsageStat{Total: 1<<50 + 1, Free: 0}, want: false},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := isPlausibleDiskUsage(tc.usage)
			assert.Equal(t, tc.want, got)
		})
	}
}
Loading