Merge pull request #66718 from ipuustin/cpu-manager-validate-offline

Automatic merge from submit-queue (batch tested with PRs 66623, 66718). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>.

cpumanager: validate topology in static policy

**What this PR does / why we need it**:

This patch adds a check for the static policy state validation. The check fails if the CPU topology obtained from cadvisor doesn't match with the current topology in the state file.

If the CPU topology has changed in a node, cpumanager static policy might try to assign non-present cores to containers.

For example in my test case, static policy had the default CPU set of `0-1,4-7`. Then kubelet was shut down and CPU 7 was offlined. After restarting the kubelet, CPU manager tries to assign the non-existent CPU 7 to containers which don't have exclusive allocations assigned to them:

    Error response from daemon: Requested CPUs are not available - requested 0-1,4-7, available: 0-6)

This breaks the exclusivity, since the CPUs from the shared pool don't get assigned to non-exclusive containers, meaning that they can execute on the exclusive CPUs.

**Release note**:

```release-note
Added CPU Manager state validation in case of changed CPU topology.
```
pull/8/head
Kubernetes Submit Queue 2018-07-31 08:05:06 -07:00 committed by GitHub
commit f2c6473e25
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 38 additions and 1 deletions

View File

@ -129,7 +129,7 @@ func (p *staticPolicy) validateState(s state.State) error {
}
// State has already been initialized from file (is not empty)
// 1 Check if the reserved cpuset is not part of default cpuset because:
// 1. Check if the reserved cpuset is not part of default cpuset because:
// - kube/system reserved have changed (increased) - may lead to some containers not being able to start
// - user tampered with file
if !p.reserved.Intersection(tmpDefaultCPUset).Equals(p.reserved) {
@ -145,6 +145,23 @@ func (p *staticPolicy) validateState(s state.State) error {
cID, cset.String(), tmpDefaultCPUset.String())
}
}
// 3. It's possible that the set of available CPUs has changed since
// the state was written. This can be due to for example
// offlining a CPU when kubelet is not running. If this happens,
// CPU manager will run into trouble when later it tries to
// assign non-existent CPUs to containers. Validate that the
// topology that was received during CPU manager startup matches with
// the set of CPUs stored in the state.
totalKnownCPUs := tmpDefaultCPUset.Clone()
for _, cset := range tmpAssignments {
totalKnownCPUs = totalKnownCPUs.Union(cset)
}
if !totalKnownCPUs.Equals(p.topology.CPUDetails.CPUs()) {
return fmt.Errorf("current set of available CPUs \"%s\" doesn't match with CPUs in state \"%s\"",
p.topology.CPUDetails.CPUs().String(), totalKnownCPUs.String())
}
return nil
}

View File

@ -87,6 +87,26 @@ func TestStaticPolicyStart(t *testing.T) {
stDefaultCPUSet: cpuset.NewCPUSet(2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
expPanic: true,
},
{
description: "core 12 is not present in topology but is in state cpuset",
topo: topoDualSocketHT,
stAssignments: state.ContainerCPUAssignments{
"0": cpuset.NewCPUSet(0, 1, 2),
"1": cpuset.NewCPUSet(3, 4),
},
stDefaultCPUSet: cpuset.NewCPUSet(5, 6, 7, 8, 9, 10, 11, 12),
expPanic: true,
},
{
description: "core 11 is present in topology but is not in state cpuset",
topo: topoDualSocketHT,
stAssignments: state.ContainerCPUAssignments{
"0": cpuset.NewCPUSet(0, 1, 2),
"1": cpuset.NewCPUSet(3, 4),
},
stDefaultCPUSet: cpuset.NewCPUSet(5, 6, 7, 8, 9, 10),
expPanic: true,
},
}
for _, testCase := range testCases {
t.Run(testCase.description, func(t *testing.T) {