2016-06-21 05:03:59 +00:00
|
|
|
/*
|
2016-06-03 00:25:58 +00:00
|
|
|
Copyright 2016 The Kubernetes Authors.
|
2016-06-21 05:03:59 +00:00
|
|
|
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
you may not use this file except in compliance with the License.
|
|
|
|
You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
limitations under the License.
|
|
|
|
*/
|
|
|
|
|
|
|
|
package cm
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
2016-10-17 17:23:48 +00:00
|
|
|
"os"
|
2016-07-06 22:08:01 +00:00
|
|
|
"path"
|
2016-10-17 17:23:48 +00:00
|
|
|
"path/filepath"
|
|
|
|
"strings"
|
2017-02-27 21:13:31 +00:00
|
|
|
"time"
|
2016-06-21 05:03:59 +00:00
|
|
|
|
2017-08-17 18:28:15 +00:00
|
|
|
units "github.com/docker/go-units"
|
2016-10-17 17:23:48 +00:00
|
|
|
"github.com/golang/glog"
|
2016-07-06 22:08:01 +00:00
|
|
|
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
|
|
|
|
cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
|
2016-10-17 17:23:48 +00:00
|
|
|
cgroupsystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
|
2016-06-21 05:03:59 +00:00
|
|
|
libcontainerconfigs "github.com/opencontainers/runc/libcontainer/configs"
|
2017-08-17 18:28:15 +00:00
|
|
|
|
2017-01-11 14:09:48 +00:00
|
|
|
"k8s.io/apimachinery/pkg/util/sets"
|
2017-08-17 18:28:15 +00:00
|
|
|
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
|
|
|
kubefeatures "k8s.io/kubernetes/pkg/features"
|
2017-02-27 21:13:31 +00:00
|
|
|
"k8s.io/kubernetes/pkg/kubelet/metrics"
|
2016-06-21 05:03:59 +00:00
|
|
|
)
|
|
|
|
|
2016-10-17 17:23:48 +00:00
|
|
|
// libcontainerCgroupManagerType identifies which libcontainer cgroup manager
// implementation is used to talk to the host.
type libcontainerCgroupManagerType string
|
|
|
|
|
|
|
|
const (
|
|
|
|
// libcontainerCgroupfs means use libcontainer with cgroupfs
|
|
|
|
libcontainerCgroupfs libcontainerCgroupManagerType = "cgroupfs"
|
|
|
|
// libcontainerSystemd means use libcontainer with systemd
|
|
|
|
libcontainerSystemd libcontainerCgroupManagerType = "systemd"
|
2017-11-17 20:04:58 +00:00
|
|
|
// systemdSuffix is the cgroup name suffix for systemd
|
|
|
|
systemdSuffix string = ".slice"
|
2016-10-17 17:23:48 +00:00
|
|
|
)
|
|
|
|
|
2017-08-17 18:28:15 +00:00
|
|
|
// hugePageSizeList holds the unit suffixes, smallest first, used when a huge
// page size in bytes is rendered into the hugetlb canonical unit string that
// libcontainer expects.
var hugePageSizeList = []string{"B", "kB", "MB", "GB", "TB", "PB"}
|
2017-08-17 18:28:15 +00:00
|
|
|
|
2016-10-17 17:23:48 +00:00
|
|
|
// ConvertCgroupNameToSystemd converts the internal cgroup name to a systemd name.
|
|
|
|
// For example, the name /Burstable/pod_123-456 becomes Burstable-pod_123_456.slice
|
|
|
|
// If outputToCgroupFs is true, it expands the systemd name into the cgroupfs form.
|
|
|
|
// For example, it will return /Burstable.slice/Burstable-pod_123_456.slice in above scenario.
|
|
|
|
func ConvertCgroupNameToSystemd(cgroupName CgroupName, outputToCgroupFs bool) string {
|
|
|
|
name := string(cgroupName)
|
|
|
|
result := ""
|
|
|
|
if name != "" && name != "/" {
|
|
|
|
parts := strings.Split(name, "/")
|
2017-02-20 17:03:58 +00:00
|
|
|
results := []string{}
|
2016-10-17 17:23:48 +00:00
|
|
|
for _, part := range parts {
|
2017-02-20 17:03:58 +00:00
|
|
|
// ignore leading stuff
|
2016-10-17 17:23:48 +00:00
|
|
|
if part == "" {
|
|
|
|
continue
|
|
|
|
}
|
2017-02-20 17:03:58 +00:00
|
|
|
// detect if we are given a systemd style name.
|
|
|
|
// if so, we do not want to do double encoding.
|
2017-11-17 20:04:58 +00:00
|
|
|
if IsSystemdStyleName(part) {
|
|
|
|
part = strings.TrimSuffix(part, systemdSuffix)
|
2017-02-20 17:03:58 +00:00
|
|
|
separatorIndex := strings.LastIndex(part, "-")
|
|
|
|
if separatorIndex >= 0 && separatorIndex < len(part) {
|
|
|
|
part = part[separatorIndex+1:]
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// systemd treats - as a step in the hierarchy, we convert all - to _
|
|
|
|
part = strings.Replace(part, "-", "_", -1)
|
2016-10-17 17:23:48 +00:00
|
|
|
}
|
2017-02-20 17:03:58 +00:00
|
|
|
results = append(results, part)
|
2016-10-17 17:23:48 +00:00
|
|
|
}
|
2017-02-20 17:03:58 +00:00
|
|
|
// each part is appended with systemd style -
|
|
|
|
result = strings.Join(results, "-")
|
2016-10-17 17:23:48 +00:00
|
|
|
} else {
|
|
|
|
// root converts to -
|
|
|
|
result = "-"
|
|
|
|
}
|
|
|
|
// always have a .slice suffix
|
2017-11-17 20:04:58 +00:00
|
|
|
if !IsSystemdStyleName(result) {
|
|
|
|
result = result + systemdSuffix
|
2017-02-20 17:03:58 +00:00
|
|
|
}
|
2016-10-17 17:23:48 +00:00
|
|
|
|
|
|
|
// if the caller desired the result in cgroupfs format...
|
|
|
|
if outputToCgroupFs {
|
|
|
|
var err error
|
|
|
|
result, err = cgroupsystemd.ExpandSlice(result)
|
|
|
|
if err != nil {
|
|
|
|
panic(fmt.Errorf("error adapting cgroup name, input: %v, err: %v", name, err))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return result
|
|
|
|
}
|
|
|
|
|
|
|
|
// ConvertCgroupFsNameToSystemd maps an expanded cgroupfs path to the systemd
// name it represents, which is the last element of the path.
// For example, test.slice/test-a.slice/test-a-b.slice yields test-a-b.slice.
// The returned error is currently always nil.
// NOTE: this is exported so that dockermanager and dockershim can use it;
// ideally both would rely on libcontainer if a function like this lands upstream.
func ConvertCgroupFsNameToSystemd(cgroupfsName string) (string, error) {
	// TODO: check whether libcontainer's systemd implementation could use
	// something similar and, if so, move this function there. At that point it
	// would likely validate systemd specifics rather than simply assuming that
	// the base of the path encodes the hierarchy per systemd convention.
	systemdName := path.Base(cgroupfsName)
	return systemdName, nil
}
|
|
|
|
|
2017-11-17 20:04:58 +00:00
|
|
|
func IsSystemdStyleName(name string) bool {
|
|
|
|
if strings.HasSuffix(name, systemdSuffix) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2016-10-17 17:23:48 +00:00
|
|
|
// libcontainerAdapter provides a simplified interface to libcontainer based on libcontainer type.
|
|
|
|
type libcontainerAdapter struct {
|
|
|
|
// cgroupManagerType defines how to interface with libcontainer
|
|
|
|
cgroupManagerType libcontainerCgroupManagerType
|
|
|
|
}
|
|
|
|
|
|
|
|
// newLibcontainerAdapter returns a configured libcontainerAdapter for specified manager.
|
|
|
|
// it does any initialization required by that manager to function.
|
|
|
|
func newLibcontainerAdapter(cgroupManagerType libcontainerCgroupManagerType) *libcontainerAdapter {
|
|
|
|
return &libcontainerAdapter{cgroupManagerType: cgroupManagerType}
|
|
|
|
}
|
|
|
|
|
|
|
|
// newManager returns an implementation of cgroups.Manager
|
|
|
|
func (l *libcontainerAdapter) newManager(cgroups *libcontainerconfigs.Cgroup, paths map[string]string) (libcontainercgroups.Manager, error) {
|
|
|
|
switch l.cgroupManagerType {
|
|
|
|
case libcontainerCgroupfs:
|
|
|
|
return &cgroupfs.Manager{
|
|
|
|
Cgroups: cgroups,
|
|
|
|
Paths: paths,
|
|
|
|
}, nil
|
|
|
|
case libcontainerSystemd:
|
|
|
|
// this means you asked systemd to manage cgroups, but systemd was not on the host, so all you can do is panic...
|
|
|
|
if !cgroupsystemd.UseSystemd() {
|
|
|
|
panic("systemd cgroup manager not available")
|
|
|
|
}
|
|
|
|
return &cgroupsystemd.Manager{
|
|
|
|
Cgroups: cgroups,
|
|
|
|
Paths: paths,
|
|
|
|
}, nil
|
|
|
|
}
|
|
|
|
return nil, fmt.Errorf("invalid cgroup manager configuration")
|
|
|
|
}
|
|
|
|
|
|
|
|
func (l *libcontainerAdapter) revertName(name string) CgroupName {
|
|
|
|
if l.cgroupManagerType != libcontainerSystemd {
|
|
|
|
return CgroupName(name)
|
|
|
|
}
|
2017-11-17 20:04:58 +00:00
|
|
|
return CgroupName(RevertFromSystemdToCgroupStyleName(name))
|
|
|
|
}
|
2016-10-17 17:23:48 +00:00
|
|
|
|
2017-11-17 20:04:58 +00:00
|
|
|
func RevertFromSystemdToCgroupStyleName(name string) string {
|
2016-10-17 17:23:48 +00:00
|
|
|
driverName, err := ConvertCgroupFsNameToSystemd(name)
|
|
|
|
if err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
2017-11-17 20:04:58 +00:00
|
|
|
driverName = strings.TrimSuffix(driverName, systemdSuffix)
|
2017-02-25 03:41:15 +00:00
|
|
|
driverName = strings.Replace(driverName, "-", "/", -1)
|
2016-10-17 17:23:48 +00:00
|
|
|
driverName = strings.Replace(driverName, "_", "-", -1)
|
2017-11-17 20:04:58 +00:00
|
|
|
return driverName
|
2016-10-17 17:23:48 +00:00
|
|
|
}
|
|
|
|
|
2017-07-27 13:31:31 +00:00
|
|
|
// adaptName converts a CgroupName identifier to a driver specific conversion value.
|
2016-10-17 17:23:48 +00:00
|
|
|
// if outputToCgroupFs is true, the result is returned in the cgroupfs format rather than the driver specific form.
|
|
|
|
func (l *libcontainerAdapter) adaptName(cgroupName CgroupName, outputToCgroupFs bool) string {
|
|
|
|
if l.cgroupManagerType != libcontainerSystemd {
|
|
|
|
name := string(cgroupName)
|
|
|
|
return name
|
|
|
|
}
|
|
|
|
return ConvertCgroupNameToSystemd(cgroupName, outputToCgroupFs)
|
|
|
|
}
|
|
|
|
|
2017-05-26 02:53:09 +00:00
|
|
|
// CgroupSubsystems holds information about the mounted cgroup subsystems
|
2016-07-13 04:39:22 +00:00
|
|
|
type CgroupSubsystems struct {
|
|
|
|
// Cgroup subsystem mounts.
|
|
|
|
// e.g.: "/sys/fs/cgroup/cpu" -> ["cpu", "cpuacct"]
|
|
|
|
Mounts []libcontainercgroups.Mount
|
|
|
|
|
|
|
|
// Cgroup subsystem to their mount location.
|
|
|
|
// e.g.: "cpu" -> "/sys/fs/cgroup/cpu"
|
|
|
|
MountPoints map[string]string
|
|
|
|
}
|
|
|
|
|
2016-06-21 05:03:59 +00:00
|
|
|
// cgroupManagerImpl implements the CgroupManager interface.
|
|
|
|
// Its a stateless object which can be used to
|
|
|
|
// update,create or delete any number of cgroups
|
|
|
|
// It uses the Libcontainer raw fs cgroup manager for cgroup management.
|
|
|
|
type cgroupManagerImpl struct {
|
|
|
|
// subsystems holds information about all the
|
2017-05-26 02:53:09 +00:00
|
|
|
// mounted cgroup subsystems on the node
|
2016-07-13 04:39:22 +00:00
|
|
|
subsystems *CgroupSubsystems
|
2016-10-17 17:23:48 +00:00
|
|
|
// simplifies interaction with libcontainer and its cgroup managers
|
|
|
|
adapter *libcontainerAdapter
|
2016-06-21 05:03:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Make sure that cgroupManagerImpl implements the CgroupManager interface
|
|
|
|
var _ CgroupManager = &cgroupManagerImpl{}
|
|
|
|
|
|
|
|
// NewCgroupManager is a factory method that returns a CgroupManager
|
2016-10-17 17:23:48 +00:00
|
|
|
func NewCgroupManager(cs *CgroupSubsystems, cgroupDriver string) CgroupManager {
|
|
|
|
managerType := libcontainerCgroupfs
|
|
|
|
if cgroupDriver == string(libcontainerSystemd) {
|
|
|
|
managerType = libcontainerSystemd
|
|
|
|
}
|
2016-06-21 05:03:59 +00:00
|
|
|
return &cgroupManagerImpl{
|
|
|
|
subsystems: cs,
|
2016-10-17 17:23:48 +00:00
|
|
|
adapter: newLibcontainerAdapter(managerType),
|
2016-06-21 05:03:59 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-10-17 17:23:48 +00:00
|
|
|
// Name converts the cgroup to the driver specific value in cgroupfs form.
|
|
|
|
func (m *cgroupManagerImpl) Name(name CgroupName) string {
|
|
|
|
return m.adapter.adaptName(name, true)
|
|
|
|
}
|
|
|
|
|
|
|
|
// CgroupName converts the literal cgroupfs name on the host to an internal identifier.
|
|
|
|
func (m *cgroupManagerImpl) CgroupName(name string) CgroupName {
|
|
|
|
return m.adapter.revertName(name)
|
|
|
|
}
|
|
|
|
|
|
|
|
// buildCgroupPaths builds a path to each cgroup subsystem for the specified name.
|
|
|
|
func (m *cgroupManagerImpl) buildCgroupPaths(name CgroupName) map[string]string {
|
|
|
|
cgroupFsAdaptedName := m.Name(name)
|
2016-07-13 04:39:22 +00:00
|
|
|
cgroupPaths := make(map[string]string, len(m.subsystems.MountPoints))
|
|
|
|
for key, val := range m.subsystems.MountPoints {
|
2016-10-17 17:23:48 +00:00
|
|
|
cgroupPaths[key] = path.Join(val, cgroupFsAdaptedName)
|
2016-07-06 22:08:01 +00:00
|
|
|
}
|
2016-10-17 17:23:48 +00:00
|
|
|
return cgroupPaths
|
|
|
|
}
|
|
|
|
|
|
|
|
// Exists checks if all subsystem cgroups already exist
|
|
|
|
func (m *cgroupManagerImpl) Exists(name CgroupName) bool {
|
|
|
|
// Get map of all cgroup paths on the system for the particular cgroup
|
|
|
|
cgroupPaths := m.buildCgroupPaths(name)
|
2016-07-06 22:08:01 +00:00
|
|
|
|
2017-05-08 21:51:03 +00:00
|
|
|
// the presence of alternative control groups not known to runc confuses
|
|
|
|
// the kubelet existence checks.
|
2017-07-27 13:31:31 +00:00
|
|
|
// ideally, we would have a mechanism in runc to support Exists() logic
|
2017-05-08 21:51:03 +00:00
|
|
|
// scoped to the set control groups it understands. this is being discussed
|
|
|
|
// in https://github.com/opencontainers/runc/issues/1440
|
|
|
|
// once resolved, we can remove this code.
|
2017-08-16 14:52:56 +00:00
|
|
|
whitelistControllers := sets.NewString("cpu", "cpuacct", "cpuset", "memory", "systemd")
|
2017-05-08 21:51:03 +00:00
|
|
|
|
2016-10-17 17:23:48 +00:00
|
|
|
// If even one cgroup path doesn't exist, then the cgroup doesn't exist.
|
2017-05-08 21:51:03 +00:00
|
|
|
for controller, path := range cgroupPaths {
|
2017-07-27 13:31:31 +00:00
|
|
|
// ignore mounts we don't care about
|
2017-05-08 21:51:03 +00:00
|
|
|
if !whitelistControllers.Has(controller) {
|
|
|
|
continue
|
|
|
|
}
|
2016-07-11 03:41:53 +00:00
|
|
|
if !libcontainercgroups.PathExists(path) {
|
2016-07-06 22:08:01 +00:00
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|
2016-10-17 17:23:48 +00:00
|
|
|
|
2016-07-06 22:08:01 +00:00
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2016-06-21 05:03:59 +00:00
|
|
|
// Destroy destroys the specified cgroup
|
|
|
|
func (m *cgroupManagerImpl) Destroy(cgroupConfig *CgroupConfig) error {
|
2017-02-27 21:13:31 +00:00
|
|
|
start := time.Now()
|
|
|
|
defer func() {
|
|
|
|
metrics.CgroupManagerLatency.WithLabelValues("destroy").Observe(metrics.SinceInMicroseconds(start))
|
|
|
|
}()
|
|
|
|
|
2016-10-17 17:23:48 +00:00
|
|
|
cgroupPaths := m.buildCgroupPaths(cgroupConfig.Name)
|
2016-06-21 05:03:59 +00:00
|
|
|
|
2016-10-17 17:23:48 +00:00
|
|
|
// we take the location in traditional cgroupfs format.
|
|
|
|
abstractCgroupFsName := string(cgroupConfig.Name)
|
|
|
|
abstractParent := CgroupName(path.Dir(abstractCgroupFsName))
|
|
|
|
abstractName := CgroupName(path.Base(abstractCgroupFsName))
|
|
|
|
|
|
|
|
driverParent := m.adapter.adaptName(abstractParent, false)
|
|
|
|
driverName := m.adapter.adaptName(abstractName, false)
|
|
|
|
|
|
|
|
// this is an ugly abstraction bleed, but systemd cgroup driver requires full paths...
|
|
|
|
if m.adapter.cgroupManagerType == libcontainerSystemd {
|
|
|
|
driverName = m.adapter.adaptName(cgroupConfig.Name, false)
|
2016-07-06 22:08:01 +00:00
|
|
|
}
|
2016-06-21 05:03:59 +00:00
|
|
|
|
2016-10-17 17:23:48 +00:00
|
|
|
// Initialize libcontainer's cgroup config with driver specific naming.
|
2016-07-06 22:08:01 +00:00
|
|
|
libcontainerCgroupConfig := &libcontainerconfigs.Cgroup{
|
2016-10-17 17:23:48 +00:00
|
|
|
Name: driverName,
|
|
|
|
Parent: driverParent,
|
2016-07-06 22:08:01 +00:00
|
|
|
}
|
2016-10-17 17:23:48 +00:00
|
|
|
|
|
|
|
manager, err := m.adapter.newManager(libcontainerCgroupConfig, cgroupPaths)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
2016-06-21 05:03:59 +00:00
|
|
|
}
|
2016-07-06 22:08:01 +00:00
|
|
|
|
2016-06-21 05:03:59 +00:00
|
|
|
// Delete cgroups using libcontainers Managers Destroy() method
|
2016-10-17 17:23:48 +00:00
|
|
|
if err = manager.Destroy(); err != nil {
|
|
|
|
return fmt.Errorf("Unable to destroy cgroup paths for cgroup %v : %v", cgroupConfig.Name, err)
|
2016-06-21 05:03:59 +00:00
|
|
|
}
|
2016-10-17 17:23:48 +00:00
|
|
|
|
2016-06-21 05:03:59 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2016-07-06 22:08:01 +00:00
|
|
|
type subsystem interface {
|
|
|
|
// Name returns the name of the subsystem.
|
|
|
|
Name() string
|
|
|
|
// Set the cgroup represented by cgroup.
|
|
|
|
Set(path string, cgroup *libcontainerconfigs.Cgroup) error
|
2017-02-28 21:03:06 +00:00
|
|
|
// GetStats returns the statistics associated with the cgroup
|
|
|
|
GetStats(path string, stats *libcontainercgroups.Stats) error
|
2016-07-06 22:08:01 +00:00
|
|
|
}
|
|
|
|
|
2018-01-31 23:09:03 +00:00
|
|
|
// getSupportedSubsystems returns a map of subsystem and if it must be mounted for the kubelet to function.
|
|
|
|
func getSupportedSubsystems() map[subsystem]bool {
|
|
|
|
supportedSubsystems := map[subsystem]bool{
|
|
|
|
&cgroupfs.MemoryGroup{}: true,
|
|
|
|
&cgroupfs.CpuGroup{}: true,
|
2017-08-17 18:28:15 +00:00
|
|
|
}
|
2018-01-31 23:09:03 +00:00
|
|
|
// not all hosts support hugetlb cgroup, and in the absent of hugetlb, we will fail silently by reporting no capacity.
|
2017-08-17 18:28:15 +00:00
|
|
|
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
|
2018-01-31 23:09:03 +00:00
|
|
|
supportedSubsystems[&cgroupfs.HugetlbGroup{}] = false
|
2017-08-17 18:28:15 +00:00
|
|
|
}
|
2018-01-08 17:32:34 +00:00
|
|
|
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) {
|
2018-01-31 23:09:03 +00:00
|
|
|
supportedSubsystems[&cgroupfs.PidsGroup{}] = true
|
2018-01-08 17:32:34 +00:00
|
|
|
}
|
2017-08-17 18:28:15 +00:00
|
|
|
return supportedSubsystems
|
2016-07-06 22:08:01 +00:00
|
|
|
}
|
|
|
|
|
2017-05-26 02:53:09 +00:00
|
|
|
// setSupportedSubsystems sets cgroup resource limits only on the supported
|
|
|
|
// subsystems. ie. cpu and memory. We don't use libcontainer's cgroup/fs/Set()
|
2016-08-02 22:13:54 +00:00
|
|
|
// method as it doesn't allow us to skip updates on the devices cgroup
|
2016-07-06 22:08:01 +00:00
|
|
|
// Allowing or denying all devices by writing 'a' to devices.allow or devices.deny is
|
|
|
|
// not possible once the device cgroups has children. Once the pod level cgroup are
|
|
|
|
// created under the QOS level cgroup we cannot update the QOS level device cgroup.
|
|
|
|
// We would like to skip setting any values on the device cgroup in this case
|
|
|
|
// but this is not possible with libcontainers Set() method
|
|
|
|
// See https://github.com/opencontainers/runc/issues/932
|
2017-05-26 02:53:09 +00:00
|
|
|
func setSupportedSubsystems(cgroupConfig *libcontainerconfigs.Cgroup) error {
|
2018-01-31 23:09:03 +00:00
|
|
|
for sys, required := range getSupportedSubsystems() {
|
2016-07-06 22:08:01 +00:00
|
|
|
if _, ok := cgroupConfig.Paths[sys.Name()]; !ok {
|
2018-01-31 23:09:03 +00:00
|
|
|
if required {
|
|
|
|
return fmt.Errorf("Failed to find subsystem mount for required subsystem: %v", sys.Name())
|
|
|
|
}
|
|
|
|
// the cgroup is not mounted, but its not required so continue...
|
|
|
|
glog.V(6).Infof("Unable to find subsystem mount for optional subsystem: %v", sys.Name())
|
|
|
|
continue
|
2016-07-06 22:08:01 +00:00
|
|
|
}
|
|
|
|
if err := sys.Set(cgroupConfig.Paths[sys.Name()], cgroupConfig); err != nil {
|
|
|
|
return fmt.Errorf("Failed to set config for supported subsystems : %v", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2016-10-17 17:23:48 +00:00
|
|
|
func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcontainerconfigs.Resources {
|
2016-07-06 22:08:01 +00:00
|
|
|
resources := &libcontainerconfigs.Resources{}
|
2016-10-17 17:23:48 +00:00
|
|
|
if resourceConfig == nil {
|
|
|
|
return resources
|
|
|
|
}
|
2016-07-06 22:08:01 +00:00
|
|
|
if resourceConfig.Memory != nil {
|
|
|
|
resources.Memory = *resourceConfig.Memory
|
|
|
|
}
|
|
|
|
if resourceConfig.CpuShares != nil {
|
|
|
|
resources.CpuShares = *resourceConfig.CpuShares
|
2016-06-21 05:03:59 +00:00
|
|
|
}
|
2016-07-06 22:08:01 +00:00
|
|
|
if resourceConfig.CpuQuota != nil {
|
|
|
|
resources.CpuQuota = *resourceConfig.CpuQuota
|
2016-06-21 05:03:59 +00:00
|
|
|
}
|
2016-10-17 17:23:48 +00:00
|
|
|
if resourceConfig.CpuPeriod != nil {
|
|
|
|
resources.CpuPeriod = *resourceConfig.CpuPeriod
|
|
|
|
}
|
2017-08-17 18:28:15 +00:00
|
|
|
|
|
|
|
// if huge pages are enabled, we set them in libcontainer
|
|
|
|
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
|
|
|
|
// for each page size enumerated, set that value
|
|
|
|
pageSizes := sets.NewString()
|
|
|
|
for pageSize, limit := range resourceConfig.HugePageLimit {
|
|
|
|
sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, hugePageSizeList)
|
|
|
|
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
|
|
|
|
Pagesize: sizeString,
|
|
|
|
Limit: uint64(limit),
|
|
|
|
})
|
|
|
|
pageSizes.Insert(sizeString)
|
|
|
|
}
|
|
|
|
// for each page size omitted, limit to 0
|
|
|
|
for _, pageSize := range cgroupfs.HugePageSizes {
|
|
|
|
if pageSizes.Has(pageSize) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
|
|
|
|
Pagesize: pageSize,
|
|
|
|
Limit: uint64(0),
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
2016-10-17 17:23:48 +00:00
|
|
|
return resources
|
|
|
|
}
|
2016-06-21 05:03:59 +00:00
|
|
|
|
2016-10-17 17:23:48 +00:00
|
|
|
// Update updates the cgroup with the specified Cgroup Configuration
|
|
|
|
func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error {
|
2017-02-27 21:13:31 +00:00
|
|
|
start := time.Now()
|
|
|
|
defer func() {
|
|
|
|
metrics.CgroupManagerLatency.WithLabelValues("update").Observe(metrics.SinceInMicroseconds(start))
|
|
|
|
}()
|
|
|
|
|
2016-10-17 17:23:48 +00:00
|
|
|
// Extract the cgroup resource parameters
|
|
|
|
resourceConfig := cgroupConfig.ResourceParameters
|
|
|
|
resources := m.toResources(resourceConfig)
|
|
|
|
|
|
|
|
cgroupPaths := m.buildCgroupPaths(cgroupConfig.Name)
|
|
|
|
|
|
|
|
// we take the location in traditional cgroupfs format.
|
|
|
|
abstractCgroupFsName := string(cgroupConfig.Name)
|
|
|
|
abstractParent := CgroupName(path.Dir(abstractCgroupFsName))
|
|
|
|
abstractName := CgroupName(path.Base(abstractCgroupFsName))
|
|
|
|
|
|
|
|
driverParent := m.adapter.adaptName(abstractParent, false)
|
|
|
|
driverName := m.adapter.adaptName(abstractName, false)
|
|
|
|
|
|
|
|
// this is an ugly abstraction bleed, but systemd cgroup driver requires full paths...
|
|
|
|
if m.adapter.cgroupManagerType == libcontainerSystemd {
|
|
|
|
driverName = m.adapter.adaptName(cgroupConfig.Name, false)
|
2016-07-13 04:39:22 +00:00
|
|
|
}
|
|
|
|
|
2016-07-06 22:08:01 +00:00
|
|
|
// Initialize libcontainer's cgroup config
|
|
|
|
libcontainerCgroupConfig := &libcontainerconfigs.Cgroup{
|
2016-10-17 17:23:48 +00:00
|
|
|
Name: driverName,
|
|
|
|
Parent: driverParent,
|
2016-07-06 22:08:01 +00:00
|
|
|
Resources: resources,
|
2016-07-13 04:39:22 +00:00
|
|
|
Paths: cgroupPaths,
|
2016-07-06 22:08:01 +00:00
|
|
|
}
|
|
|
|
|
2018-01-08 17:32:34 +00:00
|
|
|
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && cgroupConfig.ResourceParameters.PodPidsLimit != nil {
|
|
|
|
libcontainerCgroupConfig.PidsLimit = *cgroupConfig.ResourceParameters.PodPidsLimit
|
|
|
|
}
|
|
|
|
|
2017-05-26 02:53:09 +00:00
|
|
|
if err := setSupportedSubsystems(libcontainerCgroupConfig); err != nil {
|
2016-10-17 17:23:48 +00:00
|
|
|
return fmt.Errorf("failed to set supported cgroup subsystems for cgroup %v: %v", cgroupConfig.Name, err)
|
2016-06-21 05:03:59 +00:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create creates the specified cgroup
|
|
|
|
func (m *cgroupManagerImpl) Create(cgroupConfig *CgroupConfig) error {
|
2017-02-27 21:13:31 +00:00
|
|
|
start := time.Now()
|
|
|
|
defer func() {
|
|
|
|
metrics.CgroupManagerLatency.WithLabelValues("create").Observe(metrics.SinceInMicroseconds(start))
|
|
|
|
}()
|
2016-06-21 05:03:59 +00:00
|
|
|
|
2016-10-17 17:23:48 +00:00
|
|
|
// we take the location in traditional cgroupfs format.
|
|
|
|
abstractCgroupFsName := string(cgroupConfig.Name)
|
|
|
|
abstractParent := CgroupName(path.Dir(abstractCgroupFsName))
|
|
|
|
abstractName := CgroupName(path.Base(abstractCgroupFsName))
|
|
|
|
|
|
|
|
driverParent := m.adapter.adaptName(abstractParent, false)
|
|
|
|
driverName := m.adapter.adaptName(abstractName, false)
|
|
|
|
// this is an ugly abstraction bleed, but systemd cgroup driver requires full paths...
|
|
|
|
if m.adapter.cgroupManagerType == libcontainerSystemd {
|
|
|
|
driverName = m.adapter.adaptName(cgroupConfig.Name, false)
|
|
|
|
}
|
|
|
|
|
|
|
|
resources := m.toResources(cgroupConfig.ResourceParameters)
|
|
|
|
// Initialize libcontainer's cgroup config with driver specific naming.
|
2016-07-06 22:08:01 +00:00
|
|
|
libcontainerCgroupConfig := &libcontainerconfigs.Cgroup{
|
2016-10-17 17:23:48 +00:00
|
|
|
Name: driverName,
|
|
|
|
Parent: driverParent,
|
|
|
|
Resources: resources,
|
2016-06-21 05:03:59 +00:00
|
|
|
}
|
2016-07-06 22:08:01 +00:00
|
|
|
|
2018-01-08 17:32:34 +00:00
|
|
|
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && cgroupConfig.ResourceParameters.PodPidsLimit != nil {
|
|
|
|
libcontainerCgroupConfig.PidsLimit = *cgroupConfig.ResourceParameters.PodPidsLimit
|
|
|
|
}
|
|
|
|
|
2016-10-17 17:23:48 +00:00
|
|
|
// get the manager with the specified cgroup configuration
|
|
|
|
manager, err := m.adapter.newManager(libcontainerCgroupConfig, nil)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
2016-06-21 05:03:59 +00:00
|
|
|
}
|
2016-10-17 17:23:48 +00:00
|
|
|
|
|
|
|
// Apply(-1) is a hack to create the cgroup directories for each resource
|
2016-06-21 05:03:59 +00:00
|
|
|
// subsystem. The function [cgroups.Manager.apply()] applies cgroup
|
|
|
|
// configuration to the process with the specified pid.
|
2017-05-26 02:53:09 +00:00
|
|
|
// It creates cgroup files for each subsystems and writes the pid
|
2016-06-21 05:03:59 +00:00
|
|
|
// in the tasks file. We use the function to create all the required
|
|
|
|
// cgroup files but not attach any "real" pid to the cgroup.
|
2016-10-17 17:23:48 +00:00
|
|
|
if err := manager.Apply(-1); err != nil {
|
|
|
|
return err
|
2016-06-21 05:03:59 +00:00
|
|
|
}
|
2016-10-17 17:23:48 +00:00
|
|
|
|
|
|
|
// it may confuse why we call set after we do apply, but the issue is that runc
|
|
|
|
// follows a similar pattern. it's needed to ensure cpu quota is set properly.
|
|
|
|
m.Update(cgroupConfig)
|
|
|
|
|
2016-06-21 05:03:59 +00:00
|
|
|
return nil
|
|
|
|
}
|
2016-10-17 17:23:48 +00:00
|
|
|
|
2017-05-26 02:53:09 +00:00
|
|
|
// Scans through all subsystems to find pids associated with specified cgroup.
|
2016-10-17 17:23:48 +00:00
|
|
|
func (m *cgroupManagerImpl) Pids(name CgroupName) []int {
|
|
|
|
// we need the driver specific name
|
|
|
|
cgroupFsName := m.Name(name)
|
|
|
|
|
|
|
|
// Get a list of processes that we need to kill
|
|
|
|
pidsToKill := sets.NewInt()
|
|
|
|
var pids []int
|
|
|
|
for _, val := range m.subsystems.MountPoints {
|
|
|
|
dir := path.Join(val, cgroupFsName)
|
|
|
|
_, err := os.Stat(dir)
|
|
|
|
if os.IsNotExist(err) {
|
|
|
|
// The subsystem pod cgroup is already deleted
|
|
|
|
// do nothing, continue
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
// Get a list of pids that are still charged to the pod's cgroup
|
|
|
|
pids, err = getCgroupProcs(dir)
|
|
|
|
if err != nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
pidsToKill.Insert(pids...)
|
|
|
|
|
|
|
|
// WalkFunc which is called for each file and directory in the pod cgroup dir
|
|
|
|
visitor := func(path string, info os.FileInfo, err error) error {
|
2017-03-10 23:32:58 +00:00
|
|
|
if err != nil {
|
|
|
|
glog.V(4).Infof("cgroup manager encountered error scanning cgroup path %q: %v", path, err)
|
|
|
|
return filepath.SkipDir
|
|
|
|
}
|
2016-10-17 17:23:48 +00:00
|
|
|
if !info.IsDir() {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
pids, err = getCgroupProcs(path)
|
|
|
|
if err != nil {
|
2017-03-10 23:32:58 +00:00
|
|
|
glog.V(4).Infof("cgroup manager encountered error getting procs for cgroup path %q: %v", path, err)
|
2016-10-17 17:23:48 +00:00
|
|
|
return filepath.SkipDir
|
|
|
|
}
|
|
|
|
pidsToKill.Insert(pids...)
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
// Walk through the pod cgroup directory to check if
|
|
|
|
// container cgroups haven't been GCed yet. Get attached processes to
|
|
|
|
// all such unwanted containers under the pod cgroup
|
|
|
|
if err = filepath.Walk(dir, visitor); err != nil {
|
2017-03-10 23:32:58 +00:00
|
|
|
glog.V(4).Infof("cgroup manager encountered error scanning pids for directory: %q: %v", dir, err)
|
2016-10-17 17:23:48 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return pidsToKill.List()
|
|
|
|
}
|
|
|
|
|
|
|
|
// ReduceCPULimits reduces the cgroup's cpu shares to the lowest possible value
|
|
|
|
func (m *cgroupManagerImpl) ReduceCPULimits(cgroupName CgroupName) error {
|
|
|
|
// Set lowest possible CpuShares value for the cgroup
|
2017-09-05 19:38:57 +00:00
|
|
|
minimumCPUShares := uint64(MinShares)
|
2016-10-17 17:23:48 +00:00
|
|
|
resources := &ResourceConfig{
|
|
|
|
CpuShares: &minimumCPUShares,
|
|
|
|
}
|
|
|
|
containerConfig := &CgroupConfig{
|
|
|
|
Name: cgroupName,
|
|
|
|
ResourceParameters: resources,
|
|
|
|
}
|
|
|
|
return m.Update(containerConfig)
|
|
|
|
}
|
2017-02-28 21:03:06 +00:00
|
|
|
|
2017-05-26 02:53:09 +00:00
|
|
|
func getStatsSupportedSubsystems(cgroupPaths map[string]string) (*libcontainercgroups.Stats, error) {
|
2017-02-28 21:03:06 +00:00
|
|
|
stats := libcontainercgroups.NewStats()
|
2018-01-31 23:09:03 +00:00
|
|
|
for sys, required := range getSupportedSubsystems() {
|
2017-02-28 21:03:06 +00:00
|
|
|
if _, ok := cgroupPaths[sys.Name()]; !ok {
|
2018-01-31 23:09:03 +00:00
|
|
|
if required {
|
|
|
|
return nil, fmt.Errorf("Failed to find subsystem mount for required subsystem: %v", sys.Name())
|
|
|
|
}
|
|
|
|
// the cgroup is not mounted, but its not required so continue...
|
|
|
|
glog.V(6).Infof("Unable to find subsystem mount for optional subsystem: %v", sys.Name())
|
|
|
|
continue
|
2017-02-28 21:03:06 +00:00
|
|
|
}
|
|
|
|
if err := sys.GetStats(cgroupPaths[sys.Name()], stats); err != nil {
|
|
|
|
return nil, fmt.Errorf("Failed to get stats for supported subsystems : %v", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return stats, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func toResourceStats(stats *libcontainercgroups.Stats) *ResourceStats {
|
|
|
|
return &ResourceStats{
|
|
|
|
MemoryStats: &MemoryStats{
|
|
|
|
Usage: int64(stats.MemoryStats.Usage.Usage),
|
|
|
|
},
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get sets the ResourceParameters of the specified cgroup as read from the cgroup fs
|
|
|
|
func (m *cgroupManagerImpl) GetResourceStats(name CgroupName) (*ResourceStats, error) {
|
|
|
|
cgroupPaths := m.buildCgroupPaths(name)
|
2017-05-26 02:53:09 +00:00
|
|
|
stats, err := getStatsSupportedSubsystems(cgroupPaths)
|
2017-02-28 21:03:06 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to get stats supported cgroup subsystems for cgroup %v: %v", name, err)
|
|
|
|
}
|
|
|
|
return toResourceStats(stats), nil
|
|
|
|
}
|