diff --git a/receiver/systemmetricsreceiver/scraper_disk_linux.go b/receiver/systemmetricsreceiver/scraper_disk_linux.go index 6e45bbd834..2def08bc8d 100644 --- a/receiver/systemmetricsreceiver/scraper_disk_linux.go +++ b/receiver/systemmetricsreceiver/scraper_disk_linux.go @@ -9,6 +9,7 @@ import ( "context" "time" + "github.com/shirou/gopsutil/v3/disk" "go.opentelemetry.io/collector/pdata/pcommon" "go.opentelemetry.io/collector/pdata/pmetric" "go.uber.org/zap" @@ -17,6 +18,13 @@ import ( const ( metricAggDiskUsed = "aggregate_disk_used" metricAggDiskFree = "aggregate_disk_free" + + // maxPlausibleMountBytes is the upper bound for a single mount's stats (1 PiB). + // The largest EBS volume is 64 TiB, so this is ~16x above anything legitimate. + // The gopsutil disk.UsageWithContext computes Total/Free as uint64(stat.Blocks) * + // uint64(stat.Bsize). Certain filesystem states (transient loop mounts, broken + // statvfs under FS pressure) can produce huge garbage values. + maxPlausibleMountBytes uint64 = 1 << 50 ) type diskScraper struct { @@ -42,6 +50,14 @@ func (s *diskScraper) Scrape(ctx context.Context, metrics pmetric.Metrics) error var totalUsed, totalFree uint64 for _, du := range parts { + if !isPlausibleDiskUsage(du) { + s.logger.Debug("Dropping disk sample: mount with implausible stats", + zap.String("path", du.Path), + zap.Uint64("total", du.Total), + zap.Uint64("free", du.Free), + zap.Uint64("used", du.Used)) + return nil // drop entire sample to avoid poisoning min/max rollups + } totalUsed += du.Used totalFree += du.Free } @@ -53,3 +69,12 @@ func (s *diskScraper) Scrape(ctx context.Context, metrics pmetric.Metrics) error addGaugeDP(sm.Metrics().AppendEmpty(), metricAggDiskFree, "Bytes", float64(totalFree), now) return nil } + +// isPlausibleDiskUsage returns true if the usage stats are physically plausible. +// Rejects mounts where statvfs returned garbage (huge values, Free > Total, etc). +func isPlausibleDiskUsage(du *disk.UsageStat) bool { + return du.Total > 0 && + du.Free <= du.Total && + du.Total <= maxPlausibleMountBytes && + du.Free <= maxPlausibleMountBytes +} diff --git a/receiver/systemmetricsreceiver/scraper_disk_linux_test.go b/receiver/systemmetricsreceiver/scraper_disk_linux_test.go index 84bdd98324..f263fb1a9a 100644 --- a/receiver/systemmetricsreceiver/scraper_disk_linux_test.go +++ b/receiver/systemmetricsreceiver/scraper_disk_linux_test.go @@ -24,8 +24,8 @@ func TestDiskScraperName(t *testing.T) { func TestDiskScraperMetrics(t *testing.T) { ps := &MockPS{DiskUsageData: []*disk.UsageStat{ - {Path: "/", Used: 50 * 1048576, Free: 78 * 1048576}, // 50 MB used, 78 MB free - {Path: "/home", Used: 100 * 1048576, Free: 200 * 1048576}, // 100 MB used, 200 MB free + {Path: "/", Total: 128 * 1048576, Used: 50 * 1048576, Free: 78 * 1048576}, + {Path: "/home", Total: 300 * 1048576, Used: 100 * 1048576, Free: 200 * 1048576}, }} s := newDiskScraper(zap.NewNop(), ps) @@ -36,12 +36,10 @@ func TestDiskScraperMetrics(t *testing.T) { sm := metrics.ResourceMetrics().At(0).ScopeMetrics().At(0) require.Equal(t, 2, sm.Metrics().Len()) - // DiskUsed = (50 + 100) * 1048576 bytes assert.Equal(t, "aggregate_disk_used", sm.Metrics().At(0).Name()) assert.Equal(t, "Bytes", sm.Metrics().At(0).Unit()) assert.InDelta(t, 150*1048576.0, sm.Metrics().At(0).Gauge().DataPoints().At(0).DoubleValue(), 0.01) - // DiskFree = (78 + 200) * 1048576 bytes assert.Equal(t, "aggregate_disk_free", sm.Metrics().At(1).Name()) assert.Equal(t, "Bytes", sm.Metrics().At(1).Unit()) assert.InDelta(t, 278*1048576.0, sm.Metrics().At(1).Gauge().DataPoints().At(0).DoubleValue(), 0.01) @@ -64,3 +62,63 @@ func TestDiskScraperErrorSkips(t *testing.T) { require.NoError(t, s.Scrape(context.Background(), metrics)) assert.Equal(t, 0, metrics.ResourceMetrics().Len()) } + +// ---- plausibility guard tests ---- + +func TestDiskScraperDropsSampleWhenFreeExceedsTotal(t *testing.T) { + // A transient mount reports tiny total but huge free from a + // statvfs signed-multiply overflow. + ps := &MockPS{DiskUsageData: []*disk.UsageStat{ + {Path: "/", Total: 1_000_000_000, Used: 500_000_000, Free: 500_000_000}, + {Path: "/bad", Total: 4096, Used: 0, Free: 9_223_372_036_854_755_328}, + }} + s := newDiskScraper(zap.NewNop(), ps) + + metrics := pmetric.NewMetrics() + require.NoError(t, s.Scrape(context.Background(), metrics)) + assert.Equal(t, 0, metrics.ResourceMetrics().Len(), "sample must be dropped when any mount has Free > Total") +} + +func TestDiskScraperDropsSampleWhenTotalExceedsCap(t *testing.T) { + ps := &MockPS{DiskUsageData: []*disk.UsageStat{ + {Path: "/", Total: 1_000_000_000, Used: 500_000_000, Free: 500_000_000}, + {Path: "/huge", Total: 1 << 51, Used: 1 << 50, Free: 1 << 50}, + }} + s := newDiskScraper(zap.NewNop(), ps) + + metrics := pmetric.NewMetrics() + require.NoError(t, s.Scrape(context.Background(), metrics)) + assert.Equal(t, 0, metrics.ResourceMetrics().Len(), "sample must be dropped when mount exceeds 1 PiB cap") +} + +func TestDiskScraperDropsSampleWhenTotalIsZero(t *testing.T) { + ps := &MockPS{DiskUsageData: []*disk.UsageStat{ + {Path: "/", Total: 1_000_000_000, Used: 500_000_000, Free: 500_000_000}, + {Path: "/broken", Total: 0, Used: 0, Free: 0}, + }} + s := newDiskScraper(zap.NewNop(), ps) + + metrics := pmetric.NewMetrics() + require.NoError(t, s.Scrape(context.Background(), metrics)) + assert.Equal(t, 0, metrics.ResourceMetrics().Len(), "sample must be dropped when mount has zero total") +} + +func TestIsPlausibleDiskUsage(t *testing.T) { + tests := []struct { + name string + usage *disk.UsageStat + expect bool + }{ + {"normal", &disk.UsageStat{Total: 1e9, Free: 5e8}, true}, + {"full disk", &disk.UsageStat{Total: 1e9, Free: 0}, true}, + {"at 1 PiB cap", &disk.UsageStat{Total: 1 << 50, Free: 1 << 50}, true}, + {"zero total", &disk.UsageStat{Total: 0, Free: 0}, false}, + {"free > total", &disk.UsageStat{Total: 4096, Free: 9_223_372_036_854_755_328}, false}, + {"above cap", &disk.UsageStat{Total: 1<<50 + 1, Free: 0}, false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expect, isPlausibleDiskUsage(tt.usage)) + }) + } +}