mirror of https://github.com/k3s-io/k3s
Merge pull request #66718 from ipuustin/cpu-manager-validate-offline
Automatic merge from submit-queue (batch tested with PRs 66623, 66718). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>. cpumanager: validate topology in static policy **What this PR does / why we need it**: This patch adds a check for the static policy state validation. The check fails if the CPU topology obtained from cadvisor doesn't match with the current topology in the state file. If the CPU topology has changed in a node, cpumanager static policy might try to assign non-present cores to containers. For example in my test case, static policy had the default CPU set of `0-1,4-7`. Then kubelet was shut down and CPU 7 was offlined. After restarting the kubelet, CPU manager tries to assign the non-existent CPU 7 to containers which don't have exclusive allocations assigned to them: Error response from daemon: Requested CPUs are not available - requested 0-1,4-7, available: 0-6) This breaks the exclusivity, since the CPUs from the shared pool don't get assigned to non-exclusive containers, meaning that they can execute on the exclusive CPUs. **Release note**: ```release-note Added CPU Manager state validation in case of changed CPU topology. ```pull/8/head
commit
f2c6473e25
|
@ -129,7 +129,7 @@ func (p *staticPolicy) validateState(s state.State) error {
|
|||
}
|
||||
|
||||
// State has already been initialized from file (is not empty)
|
||||
// 1 Check if the reserved cpuset is not part of default cpuset because:
|
||||
// 1. Check if the reserved cpuset is not part of default cpuset because:
|
||||
// - kube/system reserved have changed (increased) - may lead to some containers not being able to start
|
||||
// - user tampered with file
|
||||
if !p.reserved.Intersection(tmpDefaultCPUset).Equals(p.reserved) {
|
||||
|
@ -145,6 +145,23 @@ func (p *staticPolicy) validateState(s state.State) error {
|
|||
cID, cset.String(), tmpDefaultCPUset.String())
|
||||
}
|
||||
}
|
||||
|
||||
// 3. It's possible that the set of available CPUs has changed since
|
||||
// the state was written. This can be due to for example
|
||||
// offlining a CPU when kubelet is not running. If this happens,
|
||||
// CPU manager will run into trouble when later it tries to
|
||||
// assign non-existent CPUs to containers. Validate that the
|
||||
// topology that was received during CPU manager startup matches with
|
||||
// the set of CPUs stored in the state.
|
||||
totalKnownCPUs := tmpDefaultCPUset.Clone()
|
||||
for _, cset := range tmpAssignments {
|
||||
totalKnownCPUs = totalKnownCPUs.Union(cset)
|
||||
}
|
||||
if !totalKnownCPUs.Equals(p.topology.CPUDetails.CPUs()) {
|
||||
return fmt.Errorf("current set of available CPUs \"%s\" doesn't match with CPUs in state \"%s\"",
|
||||
p.topology.CPUDetails.CPUs().String(), totalKnownCPUs.String())
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
|
|
@ -87,6 +87,26 @@ func TestStaticPolicyStart(t *testing.T) {
|
|||
stDefaultCPUSet: cpuset.NewCPUSet(2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
|
||||
expPanic: true,
|
||||
},
|
||||
{
|
||||
description: "core 12 is not present in topology but is in state cpuset",
|
||||
topo: topoDualSocketHT,
|
||||
stAssignments: state.ContainerCPUAssignments{
|
||||
"0": cpuset.NewCPUSet(0, 1, 2),
|
||||
"1": cpuset.NewCPUSet(3, 4),
|
||||
},
|
||||
stDefaultCPUSet: cpuset.NewCPUSet(5, 6, 7, 8, 9, 10, 11, 12),
|
||||
expPanic: true,
|
||||
},
|
||||
{
|
||||
description: "core 11 is present in topology but is not in state cpuset",
|
||||
topo: topoDualSocketHT,
|
||||
stAssignments: state.ContainerCPUAssignments{
|
||||
"0": cpuset.NewCPUSet(0, 1, 2),
|
||||
"1": cpuset.NewCPUSet(3, 4),
|
||||
},
|
||||
stDefaultCPUSet: cpuset.NewCPUSet(5, 6, 7, 8, 9, 10),
|
||||
expPanic: true,
|
||||
},
|
||||
}
|
||||
for _, testCase := range testCases {
|
||||
t.Run(testCase.description, func(t *testing.T) {
|
||||
|
|
Loading…
Reference in New Issue