/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cm

import (
	"bufio"
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"strconv"
	"strings"

	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"

	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/kubernetes/pkg/api/v1/resource"
	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
	kubefeatures "k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/kubelet/cm/util"
)

const (
	// MinShares and MaxShares are the lower and upper bounds that
	// MilliCPUToShares clamps its result to.
	// These limits are defined in the kernel:
	// https://github.com/torvalds/linux/blob/0bddd227f3dc55975e2b8dfa7fc6f959b062a2c7/kernel/sched/sched.h#L427-L428
	MinShares = 2
	MaxShares = 262144

	// SharesPerCPU is the number of CFS shares that corresponds to one whole CPU.
	SharesPerCPU = 1024
	// MilliCPUToCPU is the number of milliCPU units in one whole CPU.
	MilliCPUToCPU = 1000

	// QuotaPeriod is the CFS period MilliCPUToQuota falls back to when the
	// CPUCFSQuotaPeriod feature gate is disabled.
	// 100000 is equivalent to 100ms
	QuotaPeriod = 100000
	// MinQuotaPeriod is the smallest quota MilliCPUToQuota returns for a
	// non-zero milliCPU value; 1000 is equivalent to 1ms.
	MinQuotaPeriod = 1000
)

// MilliCPUToQuota converts milliCPU to a CFS quota value for the given period.
func MilliCPUToQuota(milliCPU int64, period int64) (quota int64) { // CFS quota is measured in two values: // - cfs_period_us=100ms (the amount of time to measure usage across given by period) // - cfs_quota=20ms (the amount of cpu time allowed to be used across a period) // so in the above example, you are limited to 20% of a single CPU // for multi-cpu environments, you just scale equivalent amounts // see https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt for details if milliCPU == 0 { return } if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUCFSQuotaPeriod) { period = QuotaPeriod } // we then convert your milliCPU to a value normalized over a period quota = (milliCPU * period) / MilliCPUToCPU // quota needs to be a minimum of 1ms. if quota < MinQuotaPeriod { quota = MinQuotaPeriod } return } // MilliCPUToShares converts the milliCPU to CFS shares. func MilliCPUToShares(milliCPU int64) uint64 { if milliCPU == 0 { // Docker converts zero milliCPU to unset, which maps to kernel default // for unset: 1024. Return 2 here to really match kernel default for // zero milliCPU. return MinShares } // Conceptually (milliCPU / milliCPUToCPU) * sharesPerCPU, but factored to improve rounding. shares := (milliCPU * SharesPerCPU) / MilliCPUToCPU if shares < MinShares { return MinShares } if shares > MaxShares { return MaxShares } return uint64(shares) } // HugePageLimits converts the API representation to a map // from huge page size (in bytes) to huge page limit (in bytes). 
func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 { hugePageLimits := map[int64]int64{} for k, v := range resourceList { if v1helper.IsHugePageResourceName(k) { pageSize, _ := v1helper.HugePageSizeFromResourceName(k) if value, exists := hugePageLimits[pageSize.Value()]; exists { hugePageLimits[pageSize.Value()] = value + v.Value() } else { hugePageLimits[pageSize.Value()] = v.Value() } } } return hugePageLimits } // ResourceConfigForPod takes the input pod and outputs the cgroup resource config. func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64) *ResourceConfig { // sum requests and limits. reqs, limits := resource.PodRequestsAndLimits(pod) cpuRequests := int64(0) cpuLimits := int64(0) memoryLimits := int64(0) if request, found := reqs[v1.ResourceCPU]; found { cpuRequests = request.MilliValue() } if limit, found := limits[v1.ResourceCPU]; found { cpuLimits = limit.MilliValue() } if limit, found := limits[v1.ResourceMemory]; found { memoryLimits = limit.Value() } // convert to CFS values cpuShares := MilliCPUToShares(cpuRequests) cpuQuota := MilliCPUToQuota(cpuLimits, int64(cpuPeriod)) // track if limits were applied for each resource. 
memoryLimitsDeclared := true cpuLimitsDeclared := true // map hugepage pagesize (bytes) to limits (bytes) hugePageLimits := map[int64]int64{} for _, container := range pod.Spec.Containers { if container.Resources.Limits.Cpu().IsZero() { cpuLimitsDeclared = false } if container.Resources.Limits.Memory().IsZero() { memoryLimitsDeclared = false } containerHugePageLimits := HugePageLimits(container.Resources.Requests) for k, v := range containerHugePageLimits { if value, exists := hugePageLimits[k]; exists { hugePageLimits[k] = value + v } else { hugePageLimits[k] = v } } } // quota is not capped when cfs quota is disabled if !enforceCPULimits { cpuQuota = int64(-1) } // determine the qos class qosClass := v1qos.GetPodQOS(pod) // build the result result := &ResourceConfig{} if qosClass == v1.PodQOSGuaranteed { result.CpuShares = &cpuShares result.CpuQuota = &cpuQuota result.CpuPeriod = &cpuPeriod result.Memory = &memoryLimits } else if qosClass == v1.PodQOSBurstable { result.CpuShares = &cpuShares if cpuLimitsDeclared { result.CpuQuota = &cpuQuota result.CpuPeriod = &cpuPeriod } if memoryLimitsDeclared { result.Memory = &memoryLimits } } else { shares := uint64(MinShares) result.CpuShares = &shares } result.HugePageLimit = hugePageLimits return result } // getCgroupSubsystemsV1 returns information about the mounted cgroup v1 subsystems func getCgroupSubsystemsV1() (*CgroupSubsystems, error) { // get all cgroup mounts. allCgroups, err := libcontainercgroups.GetCgroupMounts(true) if err != nil { return &CgroupSubsystems{}, err } if len(allCgroups) == 0 { return &CgroupSubsystems{}, fmt.Errorf("failed to find cgroup mounts") } mountPoints := make(map[string]string, len(allCgroups)) for _, mount := range allCgroups { // BEFORE kubelet used a random mount point per cgroups subsystem; // NOW more deterministic: kubelet use mount point with shortest path; // FUTURE is bright with clear expectation determined in doc. // ref. 
issue: https://github.com/kubernetes/kubernetes/issues/95488 for _, subsystem := range mount.Subsystems { previous := mountPoints[subsystem] if previous == "" || len(mount.Mountpoint) < len(previous) { mountPoints[subsystem] = mount.Mountpoint } } } return &CgroupSubsystems{ Mounts: allCgroups, MountPoints: mountPoints, }, nil } // getCgroupSubsystemsV2 returns information about the enabled cgroup v2 subsystems func getCgroupSubsystemsV2() (*CgroupSubsystems, error) { content, err := ioutil.ReadFile(filepath.Join(util.CgroupRoot, "cgroup.controllers")) if err != nil { return nil, err } mounts := []libcontainercgroups.Mount{} controllers := strings.Fields(string(content)) mountPoints := make(map[string]string, len(controllers)) for _, controller := range controllers { mountPoints[controller] = util.CgroupRoot m := libcontainercgroups.Mount{ Mountpoint: util.CgroupRoot, Root: util.CgroupRoot, Subsystems: []string{controller}, } mounts = append(mounts, m) } return &CgroupSubsystems{ Mounts: mounts, MountPoints: mountPoints, }, nil } // GetCgroupSubsystems returns information about the mounted cgroup subsystems func GetCgroupSubsystems() (*CgroupSubsystems, error) { if libcontainercgroups.IsCgroup2UnifiedMode() { return getCgroupSubsystemsV2() } return getCgroupSubsystemsV1() } // getCgroupProcs takes a cgroup directory name as an argument // reads through the cgroup's procs file and returns a list of tgid's. 
// It returns an empty list if a procs file doesn't exists func getCgroupProcs(dir string) ([]int, error) { procsFile := filepath.Join(dir, "cgroup.procs") f, err := os.Open(procsFile) if err != nil { if os.IsNotExist(err) { // The procsFile does not exist, So no pids attached to this directory return []int{}, nil } return nil, err } defer f.Close() s := bufio.NewScanner(f) out := []int{} for s.Scan() { if t := s.Text(); t != "" { pid, err := strconv.Atoi(t) if err != nil { return nil, fmt.Errorf("unexpected line in %v; could not convert to pid: %v", procsFile, err) } out = append(out, pid) } } return out, nil } // GetPodCgroupNameSuffix returns the last element of the pod CgroupName identifier func GetPodCgroupNameSuffix(podUID types.UID) string { return podCgroupNamePrefix + string(podUID) } // NodeAllocatableRoot returns the literal cgroup path for the node allocatable cgroup func NodeAllocatableRoot(cgroupRoot string, cgroupsPerQOS bool, cgroupDriver string) string { nodeAllocatableRoot := ParseCgroupfsToCgroupName(cgroupRoot) if cgroupsPerQOS { nodeAllocatableRoot = NewCgroupName(nodeAllocatableRoot, defaultNodeAllocatableCgroupName) } if libcontainerCgroupManagerType(cgroupDriver) == libcontainerSystemd { return nodeAllocatableRoot.ToSystemd() } return nodeAllocatableRoot.ToCgroupfs() } // GetKubeletContainer returns the cgroup the kubelet will use func GetKubeletContainer(kubeletCgroups string) (string, error) { if kubeletCgroups == "" { cont, err := getContainer(os.Getpid()) if err != nil { return "", err } return cont, nil } return kubeletCgroups, nil } // GetRuntimeContainer returns the cgroup used by the container runtime func GetRuntimeContainer(containerRuntime, runtimeCgroups string) (string, error) { if containerRuntime == "docker" { cont, err := getContainerNameForProcess(dockerProcessName, dockerPidFile) if err != nil { return "", fmt.Errorf("failed to get container name for docker process: %v", err) } return cont, nil } return runtimeCgroups, nil }