Skip to content

Commit b8e7428

Browse files
committed
client: fixed a bug where the bandwidth of reserved cores were not taken into account
1 parent 48863bd commit b8e7428

File tree

8 files changed

+159
-30
lines changed

8 files changed

+159
-30
lines changed

.changelog/26768.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
```release-note:bug
2+
client: fixed a bug where the bandwidth of reserved cores were not taken into account
3+
```

client/client.go

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1816,8 +1816,19 @@ func (c *Client) updateNodeFromFingerprint(response *fingerprint.FingerprintResp
18161816
}
18171817

18181818
// update config with total cpu compute if it was detected
1819-
if cpu := response.NodeResources.Processors.TotalCompute(); cpu > 0 {
1820-
newConfig.CpuCompute = cpu
1819+
totalCompute := response.NodeResources.Processors.TotalCompute()
1820+
usableCompute := response.NodeResources.Processors.UsableCompute()
1821+
if totalCompute > 0 {
1822+
if newConfig.CpuCompute != totalCompute {
1823+
newConfig.CpuCompute = totalCompute
1824+
nodeHasChanged = true
1825+
}
1826+
1827+
reservedCompute := int64(totalCompute - usableCompute)
1828+
if newConfig.Node.ReservedResources.Cpu.CpuShares != reservedCompute {
1829+
newConfig.Node.ReservedResources.Cpu.CpuShares = reservedCompute
1830+
nodeHasChanged = true
1831+
}
18211832
}
18221833
}
18231834

@@ -3323,8 +3334,9 @@ func (c *Client) setGaugeForDiskStats(hStats *hoststats.HostStats, baseLabels []
33233334
// setGaugeForAllocationStats proxies metrics for allocation specific statistics
33243335
func (c *Client) setGaugeForAllocationStats(baseLabels []metrics.Label) {
33253336
node := c.GetConfig().Node
3326-
total := node.NodeResources
3327-
res := node.ReservedResources
3337+
3338+
available := node.NodeResources.Comparable()
3339+
available.Subtract(node.ReservedResources.Comparable())
33283340
allocated := c.getAllocatedResources(node)
33293341

33303342
// Emit allocated
@@ -3342,20 +3354,15 @@ func (c *Client) setGaugeForAllocationStats(baseLabels []metrics.Label) {
33423354
}
33433355

33443356
// Emit unallocated
3345-
unallocatedMem := total.Memory.MemoryMB - res.Memory.MemoryMB - allocated.Flattened.Memory.MemoryMB
3346-
unallocatedDisk := total.Disk.DiskMB - res.Disk.DiskMB - allocated.Shared.DiskMB
3347-
3348-
// The UsableCompute function call already subtracts and accounts for any
3349-
// reserved CPU within the client configuration. Therefore, we do not need
3350-
// to subtract that here.
3351-
unallocatedCpu := int64(total.Processors.Topology.UsableCompute()) - allocated.Flattened.Cpu.CpuShares
3357+
unallocatedMem := available.Flattened.Memory.MemoryMB - allocated.Flattened.Memory.MemoryMB
3358+
unallocatedDisk := available.Shared.DiskMB - allocated.Shared.DiskMB
3359+
unallocatedCpu := available.Flattened.Cpu.CpuShares - allocated.Flattened.Cpu.CpuShares
33523360

33533361
metrics.SetGaugeWithLabels([]string{"client", "unallocated", "memory"}, float32(unallocatedMem), baseLabels)
33543362
metrics.SetGaugeWithLabels([]string{"client", "unallocated", "disk"}, float32(unallocatedDisk), baseLabels)
33553363
metrics.SetGaugeWithLabels([]string{"client", "unallocated", "cpu"}, float32(unallocatedCpu), baseLabels)
33563364

3357-
totalComparable := total.Comparable()
3358-
for _, n := range totalComparable.Flattened.Networks {
3365+
for _, n := range available.Flattened.Networks {
33593366
// Determined the used resources
33603367
var usedMbits int
33613368
totalIdx := allocated.Flattened.Networks.NetIndex(n)

client/client_test.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1688,6 +1688,54 @@ func TestClient_UpdateNodeFromDevicesAccumulates(t *testing.T) {
16881688

16891689
}
16901690

1691+
func TestClient_UpdateNodeFromFingerprintCalculatesReservedResources(t *testing.T) {
1692+
ci.Parallel(t)
1693+
1694+
// Client without network configured updates to match fingerprint
1695+
client, cleanup := TestClient(t, func(c *config.Config) {
1696+
c.Node.ReservedResources.Cpu = structs.NodeReservedCpuResources{
1697+
CpuShares: 100,
1698+
ReservedCpuCores: []uint16{0},
1699+
}
1700+
})
1701+
defer cleanup()
1702+
1703+
// Set up a basic topology where the first core is and an
1704+
// additionally 100 MHz are reserved
1705+
basicTopology := structs.MockBasicTopology()
1706+
basicTopology.Cores[0].Disable = true
1707+
basicTopology.OverrideWitholdCompute = 100
1708+
client.updateNodeFromFingerprint(&fingerprint.FingerprintResponse{
1709+
// overrides the detected hardware in TestClient
1710+
NodeResources: &structs.NodeResources{
1711+
Memory: structs.NodeMemoryResources{MemoryMB: 1024},
1712+
Processors: structs.NodeProcessorResources{
1713+
Topology: basicTopology,
1714+
},
1715+
},
1716+
})
1717+
1718+
// initial check
1719+
conf := client.GetConfig()
1720+
1721+
expectedReservedResources := &structs.NodeReservedResources{
1722+
Cpu: structs.NodeReservedCpuResources{
1723+
CpuShares: 3_600, // 1 * 3500 MHz and 100 MHz
1724+
ReservedCpuCores: []uint16{0},
1725+
},
1726+
Memory: structs.NodeReservedMemoryResources{
1727+
MemoryMB: 256,
1728+
},
1729+
Disk: structs.NodeReservedDiskResources{
1730+
DiskMB: 4096,
1731+
},
1732+
Networks: structs.NodeReservedNetworkResources{
1733+
ReservedHostPorts: "22",
1734+
},
1735+
}
1736+
must.Eq(t, expectedReservedResources, conf.Node.ReservedResources)
1737+
}
1738+
16911739
// TestClient_UpdateNodeFromFingerprintKeepsConfig asserts manually configured
16921740
// network interfaces take precedence over fingerprinted ones.
16931741
func TestClient_UpdateNodeFromFingerprintKeepsConfig(t *testing.T) {

client/lib/numalib/topology.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,16 @@ func (st *Topology) NumECores() int {
282282
return total
283283
}
284284

285+
// TotalCores returns the number of logical cores detected. The UsableCores will
286+
// be equal to or less than this value.
287+
func (st *Topology) TotalCores() *idset.Set[hw.CoreID] {
288+
result := idset.Empty[hw.CoreID]()
289+
for _, cpu := range st.Cores {
290+
result.Insert(cpu.ID)
291+
}
292+
return result
293+
}
294+
285295
// UsableCores returns the number of logical cores usable by the Nomad client
286296
// for running tasks. Nomad must subtract off any reserved cores (reserved.cores)
287297
// and/or must mask the cpuset to the one set in config (config.reservable_cores).

nomad/structs/funcs.go

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -232,20 +232,13 @@ func AllocsFit(node *Node, allocs []*Allocation, netIdx *NetworkIndex, checkDevi
232232
}
233233

234234
func computeFreePercentage(node *Node, util *ComparableResources) (freePctCpu, freePctRam float64) {
235-
reserved := node.ReservedResources.Comparable()
236-
res := node.NodeResources.Comparable()
237-
238235
// Determine the node availability
239-
nodeCpu := float64(res.Flattened.Cpu.CpuShares)
240-
nodeMem := float64(res.Flattened.Memory.MemoryMB)
241-
if reserved != nil {
242-
nodeCpu -= float64(reserved.Flattened.Cpu.CpuShares)
243-
nodeMem -= float64(reserved.Flattened.Memory.MemoryMB)
244-
}
236+
available := node.NodeResources.Comparable()
237+
available.Subtract(node.ReservedResources.Comparable())
245238

246239
// Compute the free percentage
247-
freePctCpu = 1 - (float64(util.Flattened.Cpu.CpuShares) / nodeCpu)
248-
freePctRam = 1 - (float64(util.Flattened.Memory.MemoryMB) / nodeMem)
240+
freePctCpu = 1 - (float64(util.Flattened.Cpu.CpuShares) / float64(available.Flattened.Cpu.CpuShares))
241+
freePctRam = 1 - (float64(util.Flattened.Memory.MemoryMB) / float64(available.Flattened.Memory.MemoryMB))
249242
return freePctCpu, freePctRam
250243
}
251244

nomad/structs/funcs_test.go

Lines changed: 64 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,22 @@ func node2k() *Node {
102102
ID: 0,
103103
Grade: numalib.Performance,
104104
BaseSpeed: 1000,
105+
Disable: false,
105106
}, {
106107
ID: 1,
107108
Grade: numalib.Performance,
108109
BaseSpeed: 1000,
110+
Disable: false,
111+
}, {
112+
ID: 2,
113+
Grade: numalib.Performance,
114+
BaseSpeed: 1000,
115+
Disable: true,
116+
}, {
117+
ID: 3,
118+
Grade: numalib.Performance,
119+
BaseSpeed: 1000,
120+
Disable: true,
109121
}},
110122
OverrideWitholdCompute: 1000, // set by client reserved field
111123
},
@@ -137,7 +149,8 @@ func node2k() *Node {
137149
},
138150
ReservedResources: &NodeReservedResources{
139151
Cpu: NodeReservedCpuResources{
140-
CpuShares: 1000,
152+
CpuShares: 3000, // set by fingerprinting callback
153+
ReservedCpuCores: []uint16{2, 3},
141154
},
142155
Memory: NodeReservedMemoryResources{
143156
MemoryMB: 1024,
@@ -248,6 +261,38 @@ func TestAllocsFit(t *testing.T) {
248261
must.Eq(t, 1000, used.Flattened.Cpu.CpuShares)
249262
must.Eq(t, []uint16{0}, used.Flattened.Cpu.ReservedCores)
250263
must.Eq(t, 1024, used.Flattened.Memory.MemoryMB)
264+
265+
a3 := &Allocation{
266+
AllocatedResources: &AllocatedResources{
267+
Tasks: map[string]*AllocatedTaskResources{
268+
"web": {
269+
Cpu: AllocatedCpuResources{
270+
CpuShares: 1000,
271+
},
272+
Memory: AllocatedMemoryResources{
273+
MemoryMB: 512,
274+
},
275+
},
276+
},
277+
},
278+
}
279+
280+
// Should fit one allocation
281+
fit, dim, used, err = AllocsFit(n, []*Allocation{a3}, nil, false)
282+
must.NoError(t, err)
283+
must.True(t, fit, must.Sprintf("failed for dimension %q", dim))
284+
must.Eq(t, 1000, used.Flattened.Cpu.CpuShares)
285+
must.Eq(t, []uint16{}, used.Flattened.Cpu.ReservedCores)
286+
must.Eq(t, 512, used.Flattened.Memory.MemoryMB)
287+
288+
// Should not fit second allocation
289+
fit, dim, used, err = AllocsFit(n, []*Allocation{a3, a3}, nil, false)
290+
must.NoError(t, err)
291+
must.False(t, fit)
292+
must.Eq(t, "cpu", dim)
293+
must.Eq(t, 2000, used.Flattened.Cpu.CpuShares)
294+
must.Eq(t, []uint16{}, used.Flattened.Cpu.ReservedCores)
295+
must.Eq(t, 1024, used.Flattened.Memory.MemoryMB)
251296
}
252297

253298
func TestAllocsFit_Cores(t *testing.T) {
@@ -649,8 +694,23 @@ func TestScoreFitBinPack(t *testing.T) {
649694
Cores: []numalib.Core{{
650695
ID: 0,
651696
Grade: numalib.Performance,
652-
BaseSpeed: 4096,
697+
BaseSpeed: 2048,
698+
}, {
699+
ID: 1,
700+
Grade: numalib.Performance,
701+
BaseSpeed: 2048,
702+
}, {
703+
ID: 2,
704+
Grade: numalib.Performance,
705+
BaseSpeed: 2048,
706+
Disable: true,
707+
}, {
708+
ID: 3,
709+
Grade: numalib.Performance,
710+
BaseSpeed: 2048,
711+
Disable: true,
653712
}},
713+
OverrideWitholdCompute: 2048, // set by client reserved field
654714
},
655715
},
656716
Memory: NodeMemoryResources{
@@ -661,7 +721,8 @@ func TestScoreFitBinPack(t *testing.T) {
661721
node.NodeResources.Compatibility()
662722
node.ReservedResources = &NodeReservedResources{
663723
Cpu: NodeReservedCpuResources{
664-
CpuShares: 2048,
724+
CpuShares: 6144, // set by fingerprinting callback
725+
ReservedCpuCores: []uint16{2, 3},
665726
},
666727
Memory: NodeReservedMemoryResources{
667728
MemoryMB: 4096,

nomad/structs/numa.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,3 +172,10 @@ func (r *NodeProcessorResources) TotalCompute() int {
172172
}
173173
return int(r.Topology.TotalCompute())
174174
}
175+
176+
func (r *NodeProcessorResources) UsableCompute() int {
177+
if r == nil || r.Topology == nil {
178+
return 0
179+
}
180+
return int(r.Topology.UsableCompute())
181+
}

nomad/structs/structs.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3244,16 +3244,16 @@ func (n *NodeResources) Comparable() *ComparableResources {
32443244
return nil
32453245
}
32463246

3247-
usableCores := n.Processors.Topology.UsableCores().Slice()
3248-
reservableCores := helper.ConvertSlice(usableCores, func(id hw.CoreID) uint16 {
3247+
totalCores := n.Processors.Topology.TotalCores().Slice()
3248+
cores := helper.ConvertSlice(totalCores, func(id hw.CoreID) uint16 {
32493249
return uint16(id)
32503250
})
32513251

32523252
c := &ComparableResources{
32533253
Flattened: AllocatedTaskResources{
32543254
Cpu: AllocatedCpuResources{
32553255
CpuShares: int64(n.Processors.Topology.TotalCompute()),
3256-
ReservedCores: reservableCores,
3256+
ReservedCores: cores,
32573257
},
32583258
Memory: AllocatedMemoryResources{
32593259
MemoryMB: n.Memory.MemoryMB,

0 commit comments

Comments
 (0)