mirror of https://github.com/k3s-io/k3s
Merge pull request #32577 from sjenning/memcg-notification-wip
Automatic merge from submit-queue kubelet: eviction: add memcg threshold notifier to improve eviction responsiveness This PR adds the ability for the eviction code to get immediate notification from the kernel when the available memory in the root cgroup falls below a user defined threshold, controlled by setting the `memory.available` siginal with the `--eviction-hard` flag. This PR by itself, doesn't change anything as the frequency at which new stats can be obtained is currently controlled by the cadvisor housekeeping interval. That being the case, the call to `synchronize()` by the notification loop will very likely get stale stats and not act any more quickly than it does now. However, whenever cadvisor does get on-demand stat gathering ability, this will improve eviction responsiveness by getting async notification of the root cgroup memory state rather than relying on polling cadvisor. @vishh @derekwaynecarr @kubernetes/rh-cluster-infrapull/6/head
commit
f8d8831c71
|
@ -25,6 +25,7 @@ go_library(
|
|||
"//pkg/api/unversioned:go_default_library",
|
||||
"//pkg/client/record:go_default_library",
|
||||
"//pkg/kubelet/api/v1alpha1/stats:go_default_library",
|
||||
"//pkg/kubelet/cm:go_default_library",
|
||||
"//pkg/kubelet/lifecycle:go_default_library",
|
||||
"//pkg/kubelet/qos:go_default_library",
|
||||
"//pkg/kubelet/server/stats:go_default_library",
|
||||
|
|
|
@ -24,7 +24,9 @@ import (
|
|||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/kubernetes/pkg/api"
|
||||
"k8s.io/kubernetes/pkg/api/resource"
|
||||
"k8s.io/kubernetes/pkg/client/record"
|
||||
"k8s.io/kubernetes/pkg/kubelet/cm"
|
||||
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
|
||||
"k8s.io/kubernetes/pkg/kubelet/qos"
|
||||
"k8s.io/kubernetes/pkg/kubelet/server/stats"
|
||||
|
@ -33,7 +35,7 @@ import (
|
|||
"k8s.io/kubernetes/pkg/util/wait"
|
||||
)
|
||||
|
||||
// managerImpl implements NodeStabilityManager
|
||||
// managerImpl implements Manager
|
||||
type managerImpl struct {
|
||||
// used to track time
|
||||
clock clock.Clock
|
||||
|
@ -65,6 +67,8 @@ type managerImpl struct {
|
|||
resourceToNodeReclaimFuncs map[api.ResourceName]nodeReclaimFuncs
|
||||
// last observations from synchronize
|
||||
lastObservations signalObservations
|
||||
// notifiersInitialized indicates if the threshold notifiers have been initialized (i.e. synchronize() has been called once)
|
||||
notifiersInitialized bool
|
||||
}
|
||||
|
||||
// ensure it implements the required interface
|
||||
|
@ -139,6 +143,39 @@ func (m *managerImpl) IsUnderDiskPressure() bool {
|
|||
return hasNodeCondition(m.nodeConditions, api.NodeDiskPressure)
|
||||
}
|
||||
|
||||
func startMemoryThresholdNotifier(thresholds []Threshold, observations signalObservations, hard bool, handler thresholdNotifierHandlerFunc) error {
|
||||
for _, threshold := range thresholds {
|
||||
if threshold.Signal != SignalMemoryAvailable || hard != isHardEvictionThreshold(threshold) {
|
||||
continue
|
||||
}
|
||||
observed, found := observations[SignalMemoryAvailable]
|
||||
if !found {
|
||||
continue
|
||||
}
|
||||
cgroups, err := cm.GetCgroupSubsystems()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// TODO add support for eviction from --cgroup-root
|
||||
cgpath, found := cgroups.MountPoints["memory"]
|
||||
if !found || len(cgpath) == 0 {
|
||||
return fmt.Errorf("memory cgroup mount point not found")
|
||||
}
|
||||
attribute := "memory.usage_in_bytes"
|
||||
quantity := getThresholdQuantity(threshold.Value, observed.capacity)
|
||||
usageThreshold := resource.NewQuantity(observed.capacity.Value(), resource.DecimalSI)
|
||||
usageThreshold.Sub(*quantity)
|
||||
description := fmt.Sprintf("<%s available", formatThresholdValue(threshold.Value))
|
||||
memcgThresholdNotifier, err := NewMemCGThresholdNotifier(cgpath, attribute, usageThreshold.String(), description, handler)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
go memcgThresholdNotifier.Start(wait.NeverStop)
|
||||
return nil
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// synchronize is the main control loop that enforces eviction thresholds.
|
||||
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) {
|
||||
// if we have nothing to do, just return
|
||||
|
@ -166,8 +203,27 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
|
|||
return
|
||||
}
|
||||
|
||||
// find the list of thresholds that are met independent of grace period
|
||||
now := m.clock.Now()
|
||||
// attempt to create a threshold notifier to improve eviction response time
|
||||
if !m.notifiersInitialized {
|
||||
m.notifiersInitialized = true
|
||||
// start soft memory notification
|
||||
err = startMemoryThresholdNotifier(m.config.Thresholds, observations, false, func(desc string) {
|
||||
glog.Infof("soft memory eviction threshold crossed at %s", desc)
|
||||
// TODO wait grace period for soft memory limit
|
||||
m.synchronize(diskInfoProvider, podFunc)
|
||||
})
|
||||
if err != nil {
|
||||
glog.Warningf("eviction manager: failed to create hard memory threshold notifier: %v", err)
|
||||
}
|
||||
// start hard memory notification
|
||||
err = startMemoryThresholdNotifier(m.config.Thresholds, observations, true, func(desc string) {
|
||||
glog.Infof("hard memory eviction threshold crossed at %s", desc)
|
||||
m.synchronize(diskInfoProvider, podFunc)
|
||||
})
|
||||
if err != nil {
|
||||
glog.Warningf("eviction manager: failed to create soft memory threshold notifier: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// determine the set of thresholds met independent of grace period
|
||||
thresholds = thresholdsMet(thresholds, observations, false)
|
||||
|
@ -182,6 +238,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
|
|||
thresholds = thresholdsUpdatedStats(thresholds, observations, m.lastObservations)
|
||||
|
||||
// track when a threshold was first observed
|
||||
now := m.clock.Now()
|
||||
thresholdsFirstObservedAt := thresholdsFirstObservedAt(thresholds, m.thresholdsFirstObservedAt, now)
|
||||
|
||||
// the set of node conditions that are triggered by currently observed thresholds
|
||||
|
@ -218,7 +275,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
|
|||
glog.Warningf("eviction manager: attempting to reclaim %v", resourceToReclaim)
|
||||
|
||||
// determine if this is a soft or hard eviction associated with the resource
|
||||
softEviction := isSoftEviction(thresholds, resourceToReclaim)
|
||||
softEviction := isSoftEvictionThresholds(thresholds, resourceToReclaim)
|
||||
|
||||
// record an event about the resources we are now attempting to reclaim via eviction
|
||||
m.recorder.Eventf(m.nodeRef, api.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim)
|
||||
|
|
|
@ -848,18 +848,23 @@ func getStarvedResources(thresholds []Threshold) []api.ResourceName {
|
|||
}
|
||||
|
||||
// isSoftEviction returns true if the thresholds met for the starved resource are only soft thresholds
|
||||
func isSoftEviction(thresholds []Threshold, starvedResource api.ResourceName) bool {
|
||||
func isSoftEvictionThresholds(thresholds []Threshold, starvedResource api.ResourceName) bool {
|
||||
for _, threshold := range thresholds {
|
||||
if resourceToCheck := signalToResource[threshold.Signal]; resourceToCheck != starvedResource {
|
||||
continue
|
||||
}
|
||||
if threshold.GracePeriod == time.Duration(0) {
|
||||
if isHardEvictionThreshold(threshold) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// isSoftEviction returns true if the thresholds met for the starved resource are only soft thresholds
|
||||
func isHardEvictionThreshold(threshold Threshold) bool {
|
||||
return threshold.GracePeriod == time.Duration(0)
|
||||
}
|
||||
|
||||
// buildResourceToRankFunc returns ranking functions associated with resources
|
||||
func buildResourceToRankFunc(withImageFs bool) map[api.ResourceName]rankFunc {
|
||||
resourceToRankFunc := map[api.ResourceName]rankFunc{
|
||||
|
|
|
@ -0,0 +1,124 @@
|
|||
/*
|
||||
Copyright 2016 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package eviction
|
||||
|
||||
/*
|
||||
#include <sys/eventfd.h>
|
||||
*/
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"syscall"
|
||||
|
||||
"github.com/golang/glog"
|
||||
)
|
||||
|
||||
// ThresholdNotifier notifies the user when an attribute crosses a threshold value
|
||||
type ThresholdNotifier interface {
|
||||
Start(stopCh <-chan struct{})
|
||||
}
|
||||
|
||||
type memcgThresholdNotifier struct {
|
||||
watchfd int
|
||||
controlfd int
|
||||
eventfd int
|
||||
handler thresholdNotifierHandlerFunc
|
||||
description string
|
||||
}
|
||||
|
||||
var _ ThresholdNotifier = &memcgThresholdNotifier{}
|
||||
|
||||
// NewMemCGThresholdNotifier sends notifications when a cgroup threshold
|
||||
// is crossed (in either direction) for a given cgroup attribute
|
||||
func NewMemCGThresholdNotifier(path, attribute, threshold, description string, handler thresholdNotifierHandlerFunc) (ThresholdNotifier, error) {
|
||||
watchfd, err := syscall.Open(fmt.Sprintf("%s/%s", path, attribute), syscall.O_RDONLY, 0)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer func() {
|
||||
if err != nil {
|
||||
syscall.Close(watchfd)
|
||||
}
|
||||
}()
|
||||
controlfd, err := syscall.Open(fmt.Sprintf("%s/cgroup.event_control", path), syscall.O_WRONLY, 0)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer func() {
|
||||
if err != nil {
|
||||
syscall.Close(controlfd)
|
||||
}
|
||||
}()
|
||||
efd, err := C.eventfd(0, C.EFD_CLOEXEC)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
eventfd := int(efd)
|
||||
if eventfd < 0 {
|
||||
err = fmt.Errorf("eventfd call failed")
|
||||
return nil, err
|
||||
}
|
||||
defer func() {
|
||||
if err != nil {
|
||||
syscall.Close(eventfd)
|
||||
}
|
||||
}()
|
||||
glog.V(2).Infof("eviction: setting notification threshold to %s", threshold)
|
||||
config := fmt.Sprintf("%d %d %s", eventfd, watchfd, threshold)
|
||||
_, err = syscall.Write(controlfd, []byte(config))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &memcgThresholdNotifier{
|
||||
watchfd: watchfd,
|
||||
controlfd: controlfd,
|
||||
eventfd: eventfd,
|
||||
handler: handler,
|
||||
description: description,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func getThresholdEvents(eventfd int, eventCh chan<- int) {
|
||||
for {
|
||||
buf := make([]byte, 8)
|
||||
_, err := syscall.Read(eventfd, buf)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
eventCh <- 0
|
||||
}
|
||||
}
|
||||
|
||||
func (n *memcgThresholdNotifier) Start(stopCh <-chan struct{}) {
|
||||
eventCh := make(chan int, 1)
|
||||
go getThresholdEvents(n.eventfd, eventCh)
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
glog.V(2).Infof("eviction: stopping threshold notifier")
|
||||
syscall.Close(n.watchfd)
|
||||
syscall.Close(n.controlfd)
|
||||
syscall.Close(n.eventfd)
|
||||
close(eventCh)
|
||||
return
|
||||
case <-eventCh:
|
||||
glog.V(2).Infof("eviction: threshold crossed")
|
||||
n.handler(n.description)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -161,3 +161,6 @@ type nodeReclaimFunc func() (*resource.Quantity, error)
|
|||
|
||||
// nodeReclaimFuncs is an ordered list of nodeReclaimFunc
|
||||
type nodeReclaimFuncs []nodeReclaimFunc
|
||||
|
||||
// thresholdNotifierHandlerFunc is a function that takes action in response to a crossed threshold
|
||||
type thresholdNotifierHandlerFunc func(thresholdDescription string)
|
||||
|
|
Loading…
Reference in New Issue