2019-01-12 04:58:27 +00:00
|
|
|
/*
|
|
|
|
Copyright 2016 The Kubernetes Authors.
|
|
|
|
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
you may not use this file except in compliance with the License.
|
|
|
|
You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
limitations under the License.
|
|
|
|
*/
|
|
|
|
|
|
|
|
package cm
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
|
|
|
"fmt"
|
2020-03-26 21:07:15 +00:00
|
|
|
"io/ioutil"
|
2019-01-12 04:58:27 +00:00
|
|
|
"os"
|
|
|
|
"path/filepath"
|
|
|
|
"strconv"
|
2020-03-26 21:07:15 +00:00
|
|
|
"strings"
|
2019-01-12 04:58:27 +00:00
|
|
|
|
|
|
|
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
|
|
|
|
|
2020-08-10 17:43:49 +00:00
|
|
|
"k8s.io/api/core/v1"
|
2019-01-12 04:58:27 +00:00
|
|
|
"k8s.io/apimachinery/pkg/types"
|
2019-08-30 18:33:25 +00:00
|
|
|
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
2019-01-12 04:58:27 +00:00
|
|
|
"k8s.io/kubernetes/pkg/api/v1/resource"
|
|
|
|
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
|
|
|
|
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
|
2019-08-30 18:33:25 +00:00
|
|
|
kubefeatures "k8s.io/kubernetes/pkg/features"
|
2020-03-26 21:07:15 +00:00
|
|
|
"k8s.io/kubernetes/pkg/kubelet/cm/util"
|
2019-01-12 04:58:27 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
const (
	// MinShares and MaxShares bound the CPU shares value. Both limits are
	// defined in the kernel:
	// https://github.com/torvalds/linux/blob/0bddd227f3dc55975e2b8dfa7fc6f959b062a2c7/kernel/sched/sched.h#L427-L428
	MinShares = 2
	MaxShares = 262144

	// SharesPerCPU is the number of CPU shares that one whole CPU maps to.
	SharesPerCPU = 1024
	// MilliCPUToCPU is the number of milliCPU units in one whole CPU.
	MilliCPUToCPU = 1000

	// QuotaPeriod is the default CFS period; 100000 is equivalent to 100ms.
	QuotaPeriod = 100000
	// MinQuotaPeriod is the smallest CFS quota handed out; 1000 is equivalent to 1ms.
	MinQuotaPeriod = 1000
)
|
|
|
|
|
|
|
|
// MilliCPUToQuota converts milliCPU to CFS quota and period values.
|
|
|
|
func MilliCPUToQuota(milliCPU int64, period int64) (quota int64) {
|
|
|
|
// CFS quota is measured in two values:
|
|
|
|
// - cfs_period_us=100ms (the amount of time to measure usage across given by period)
|
|
|
|
// - cfs_quota=20ms (the amount of cpu time allowed to be used across a period)
|
|
|
|
// so in the above example, you are limited to 20% of a single CPU
|
|
|
|
// for multi-cpu environments, you just scale equivalent amounts
|
|
|
|
// see https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt for details
|
|
|
|
|
|
|
|
if milliCPU == 0 {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2019-08-30 18:33:25 +00:00
|
|
|
if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUCFSQuotaPeriod) {
|
|
|
|
period = QuotaPeriod
|
|
|
|
}
|
2019-01-12 04:58:27 +00:00
|
|
|
|
|
|
|
// we then convert your milliCPU to a value normalized over a period
|
|
|
|
quota = (milliCPU * period) / MilliCPUToCPU
|
|
|
|
|
|
|
|
// quota needs to be a minimum of 1ms.
|
|
|
|
if quota < MinQuotaPeriod {
|
|
|
|
quota = MinQuotaPeriod
|
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// MilliCPUToShares converts the milliCPU to CFS shares.
|
|
|
|
func MilliCPUToShares(milliCPU int64) uint64 {
|
|
|
|
if milliCPU == 0 {
|
|
|
|
// Docker converts zero milliCPU to unset, which maps to kernel default
|
|
|
|
// for unset: 1024. Return 2 here to really match kernel default for
|
|
|
|
// zero milliCPU.
|
|
|
|
return MinShares
|
|
|
|
}
|
|
|
|
// Conceptually (milliCPU / milliCPUToCPU) * sharesPerCPU, but factored to improve rounding.
|
|
|
|
shares := (milliCPU * SharesPerCPU) / MilliCPUToCPU
|
|
|
|
if shares < MinShares {
|
|
|
|
return MinShares
|
|
|
|
}
|
2020-08-10 17:43:49 +00:00
|
|
|
if shares > MaxShares {
|
|
|
|
return MaxShares
|
|
|
|
}
|
2019-01-12 04:58:27 +00:00
|
|
|
return uint64(shares)
|
|
|
|
}
|
|
|
|
|
|
|
|
// HugePageLimits converts the API representation to a map
|
|
|
|
// from huge page size (in bytes) to huge page limit (in bytes).
|
|
|
|
func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 {
|
|
|
|
hugePageLimits := map[int64]int64{}
|
|
|
|
for k, v := range resourceList {
|
|
|
|
if v1helper.IsHugePageResourceName(k) {
|
|
|
|
pageSize, _ := v1helper.HugePageSizeFromResourceName(k)
|
|
|
|
if value, exists := hugePageLimits[pageSize.Value()]; exists {
|
|
|
|
hugePageLimits[pageSize.Value()] = value + v.Value()
|
|
|
|
} else {
|
|
|
|
hugePageLimits[pageSize.Value()] = v.Value()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return hugePageLimits
|
|
|
|
}
|
|
|
|
|
|
|
|
// ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
|
|
|
|
func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64) *ResourceConfig {
|
|
|
|
// sum requests and limits.
|
|
|
|
reqs, limits := resource.PodRequestsAndLimits(pod)
|
|
|
|
|
|
|
|
cpuRequests := int64(0)
|
|
|
|
cpuLimits := int64(0)
|
|
|
|
memoryLimits := int64(0)
|
|
|
|
if request, found := reqs[v1.ResourceCPU]; found {
|
|
|
|
cpuRequests = request.MilliValue()
|
|
|
|
}
|
|
|
|
if limit, found := limits[v1.ResourceCPU]; found {
|
|
|
|
cpuLimits = limit.MilliValue()
|
|
|
|
}
|
|
|
|
if limit, found := limits[v1.ResourceMemory]; found {
|
|
|
|
memoryLimits = limit.Value()
|
|
|
|
}
|
|
|
|
|
|
|
|
// convert to CFS values
|
|
|
|
cpuShares := MilliCPUToShares(cpuRequests)
|
|
|
|
cpuQuota := MilliCPUToQuota(cpuLimits, int64(cpuPeriod))
|
|
|
|
|
|
|
|
// track if limits were applied for each resource.
|
|
|
|
memoryLimitsDeclared := true
|
|
|
|
cpuLimitsDeclared := true
|
|
|
|
// map hugepage pagesize (bytes) to limits (bytes)
|
|
|
|
hugePageLimits := map[int64]int64{}
|
|
|
|
for _, container := range pod.Spec.Containers {
|
|
|
|
if container.Resources.Limits.Cpu().IsZero() {
|
|
|
|
cpuLimitsDeclared = false
|
|
|
|
}
|
|
|
|
if container.Resources.Limits.Memory().IsZero() {
|
|
|
|
memoryLimitsDeclared = false
|
|
|
|
}
|
|
|
|
containerHugePageLimits := HugePageLimits(container.Resources.Requests)
|
|
|
|
for k, v := range containerHugePageLimits {
|
|
|
|
if value, exists := hugePageLimits[k]; exists {
|
|
|
|
hugePageLimits[k] = value + v
|
|
|
|
} else {
|
|
|
|
hugePageLimits[k] = v
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// quota is not capped when cfs quota is disabled
|
|
|
|
if !enforceCPULimits {
|
|
|
|
cpuQuota = int64(-1)
|
|
|
|
}
|
|
|
|
|
|
|
|
// determine the qos class
|
|
|
|
qosClass := v1qos.GetPodQOS(pod)
|
|
|
|
|
|
|
|
// build the result
|
|
|
|
result := &ResourceConfig{}
|
|
|
|
if qosClass == v1.PodQOSGuaranteed {
|
|
|
|
result.CpuShares = &cpuShares
|
|
|
|
result.CpuQuota = &cpuQuota
|
|
|
|
result.CpuPeriod = &cpuPeriod
|
|
|
|
result.Memory = &memoryLimits
|
|
|
|
} else if qosClass == v1.PodQOSBurstable {
|
|
|
|
result.CpuShares = &cpuShares
|
|
|
|
if cpuLimitsDeclared {
|
|
|
|
result.CpuQuota = &cpuQuota
|
|
|
|
result.CpuPeriod = &cpuPeriod
|
|
|
|
}
|
|
|
|
if memoryLimitsDeclared {
|
|
|
|
result.Memory = &memoryLimits
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
shares := uint64(MinShares)
|
|
|
|
result.CpuShares = &shares
|
|
|
|
}
|
|
|
|
result.HugePageLimit = hugePageLimits
|
|
|
|
return result
|
|
|
|
}
|
|
|
|
|
2020-03-26 21:07:15 +00:00
|
|
|
// getCgroupSubsystemsV1 returns information about the mounted cgroup v1 subsystems
|
|
|
|
func getCgroupSubsystemsV1() (*CgroupSubsystems, error) {
|
2019-01-12 04:58:27 +00:00
|
|
|
// get all cgroup mounts.
|
|
|
|
allCgroups, err := libcontainercgroups.GetCgroupMounts(true)
|
|
|
|
if err != nil {
|
|
|
|
return &CgroupSubsystems{}, err
|
|
|
|
}
|
|
|
|
if len(allCgroups) == 0 {
|
|
|
|
return &CgroupSubsystems{}, fmt.Errorf("failed to find cgroup mounts")
|
|
|
|
}
|
|
|
|
mountPoints := make(map[string]string, len(allCgroups))
|
|
|
|
for _, mount := range allCgroups {
|
2021-03-18 22:40:29 +00:00
|
|
|
// BEFORE kubelet used a random mount point per cgroups subsystem;
|
|
|
|
// NOW more deterministic: kubelet use mount point with shortest path;
|
|
|
|
// FUTURE is bright with clear expectation determined in doc.
|
|
|
|
// ref. issue: https://github.com/kubernetes/kubernetes/issues/95488
|
|
|
|
|
2019-01-12 04:58:27 +00:00
|
|
|
for _, subsystem := range mount.Subsystems {
|
2021-03-18 22:40:29 +00:00
|
|
|
previous := mountPoints[subsystem]
|
|
|
|
if previous == "" || len(mount.Mountpoint) < len(previous) {
|
|
|
|
mountPoints[subsystem] = mount.Mountpoint
|
|
|
|
}
|
2019-01-12 04:58:27 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return &CgroupSubsystems{
|
|
|
|
Mounts: allCgroups,
|
|
|
|
MountPoints: mountPoints,
|
|
|
|
}, nil
|
|
|
|
}
|
|
|
|
|
2020-03-26 21:07:15 +00:00
|
|
|
// getCgroupSubsystemsV2 returns information about the enabled cgroup v2 subsystems
|
|
|
|
func getCgroupSubsystemsV2() (*CgroupSubsystems, error) {
|
|
|
|
content, err := ioutil.ReadFile(filepath.Join(util.CgroupRoot, "cgroup.controllers"))
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
mounts := []libcontainercgroups.Mount{}
|
|
|
|
controllers := strings.Fields(string(content))
|
|
|
|
mountPoints := make(map[string]string, len(controllers))
|
|
|
|
for _, controller := range controllers {
|
|
|
|
mountPoints[controller] = util.CgroupRoot
|
|
|
|
m := libcontainercgroups.Mount{
|
|
|
|
Mountpoint: util.CgroupRoot,
|
|
|
|
Root: util.CgroupRoot,
|
|
|
|
Subsystems: []string{controller},
|
|
|
|
}
|
|
|
|
mounts = append(mounts, m)
|
|
|
|
}
|
|
|
|
|
|
|
|
return &CgroupSubsystems{
|
|
|
|
Mounts: mounts,
|
|
|
|
MountPoints: mountPoints,
|
|
|
|
}, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetCgroupSubsystems returns information about the mounted cgroup subsystems
|
|
|
|
func GetCgroupSubsystems() (*CgroupSubsystems, error) {
|
|
|
|
if libcontainercgroups.IsCgroup2UnifiedMode() {
|
|
|
|
return getCgroupSubsystemsV2()
|
|
|
|
}
|
|
|
|
|
|
|
|
return getCgroupSubsystemsV1()
|
|
|
|
}
|
|
|
|
|
2019-01-12 04:58:27 +00:00
|
|
|
// getCgroupProcs takes a cgroup directory name as an argument,
// reads through the cgroup's procs file and returns a list of tgid's.
// It returns an empty list if the procs file does not exist.
//
// An error is returned when the file cannot be opened for any other reason,
// when a non-empty line is not an integer, or when reading the file fails
// part-way through.
func getCgroupProcs(dir string) ([]int, error) {
	procsFile := filepath.Join(dir, "cgroup.procs")
	f, err := os.Open(procsFile)
	if err != nil {
		if os.IsNotExist(err) {
			// The procsFile does not exist, so no pids are attached to this directory.
			return []int{}, nil
		}
		return nil, err
	}
	defer f.Close()

	s := bufio.NewScanner(f)
	out := []int{}
	for s.Scan() {
		t := s.Text()
		if t == "" {
			continue
		}
		pid, err := strconv.Atoi(t)
		if err != nil {
			return nil, fmt.Errorf("unexpected line in %v; could not convert to pid: %v", procsFile, err)
		}
		out = append(out, pid)
	}
	// Scan also returns false on a read error or an over-long token; without
	// this check such a failure would be reported as a (truncated) success.
	if err := s.Err(); err != nil {
		return nil, fmt.Errorf("error reading %v: %v", procsFile, err)
	}
	return out, nil
}
|
|
|
|
|
|
|
|
// GetPodCgroupNameSuffix returns the last element of the pod CgroupName identifier
|
|
|
|
func GetPodCgroupNameSuffix(podUID types.UID) string {
|
|
|
|
return podCgroupNamePrefix + string(podUID)
|
|
|
|
}
|
2019-08-30 18:33:25 +00:00
|
|
|
|
|
|
|
// NodeAllocatableRoot returns the literal cgroup path for the node allocatable cgroup
|
2020-08-10 17:43:49 +00:00
|
|
|
func NodeAllocatableRoot(cgroupRoot string, cgroupsPerQOS bool, cgroupDriver string) string {
|
|
|
|
nodeAllocatableRoot := ParseCgroupfsToCgroupName(cgroupRoot)
|
|
|
|
if cgroupsPerQOS {
|
|
|
|
nodeAllocatableRoot = NewCgroupName(nodeAllocatableRoot, defaultNodeAllocatableCgroupName)
|
|
|
|
}
|
|
|
|
if libcontainerCgroupManagerType(cgroupDriver) == libcontainerSystemd {
|
|
|
|
return nodeAllocatableRoot.ToSystemd()
|
|
|
|
}
|
2019-08-30 18:33:25 +00:00
|
|
|
return nodeAllocatableRoot.ToCgroupfs()
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetKubeletContainer returns the cgroup the kubelet will use
|
|
|
|
func GetKubeletContainer(kubeletCgroups string) (string, error) {
|
|
|
|
if kubeletCgroups == "" {
|
|
|
|
cont, err := getContainer(os.Getpid())
|
|
|
|
if err != nil {
|
|
|
|
return "", err
|
|
|
|
}
|
|
|
|
return cont, nil
|
|
|
|
}
|
|
|
|
return kubeletCgroups, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetRuntimeContainer returns the cgroup used by the container runtime
|
|
|
|
func GetRuntimeContainer(containerRuntime, runtimeCgroups string) (string, error) {
|
|
|
|
if containerRuntime == "docker" {
|
|
|
|
cont, err := getContainerNameForProcess(dockerProcessName, dockerPidFile)
|
|
|
|
if err != nil {
|
|
|
|
return "", fmt.Errorf("failed to get container name for docker process: %v", err)
|
|
|
|
}
|
|
|
|
return cont, nil
|
|
|
|
}
|
|
|
|
return runtimeCgroups, nil
|
|
|
|
}
|