diff --git a/pkg/kubelet/eviction/BUILD b/pkg/kubelet/eviction/BUILD index 3cf7984a25..0d3bf2663b 100644 --- a/pkg/kubelet/eviction/BUILD +++ b/pkg/kubelet/eviction/BUILD @@ -25,6 +25,7 @@ go_library( "//pkg/api/unversioned:go_default_library", "//pkg/client/record:go_default_library", "//pkg/kubelet/api/v1alpha1/stats:go_default_library", + "//pkg/kubelet/cm:go_default_library", "//pkg/kubelet/lifecycle:go_default_library", "//pkg/kubelet/qos:go_default_library", "//pkg/kubelet/server/stats:go_default_library", diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index 9f56a729aa..0e98156493 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -24,7 +24,9 @@ import ( "github.com/golang/glog" "k8s.io/kubernetes/pkg/api" + "k8s.io/kubernetes/pkg/api/resource" "k8s.io/kubernetes/pkg/client/record" + "k8s.io/kubernetes/pkg/kubelet/cm" "k8s.io/kubernetes/pkg/kubelet/lifecycle" "k8s.io/kubernetes/pkg/kubelet/qos" "k8s.io/kubernetes/pkg/kubelet/server/stats" @@ -33,7 +35,7 @@ import ( "k8s.io/kubernetes/pkg/util/wait" ) -// managerImpl implements NodeStabilityManager +// managerImpl implements Manager type managerImpl struct { // used to track time clock clock.Clock @@ -65,6 +67,8 @@ type managerImpl struct { resourceToNodeReclaimFuncs map[api.ResourceName]nodeReclaimFuncs // last observations from synchronize lastObservations signalObservations + // notifiersInitialized indicates if the threshold notifiers have been initialized (i.e. synchronize() has been called once) + notifiersInitialized bool } // ensure it implements the required interface @@ -139,6 +143,39 @@ func (m *managerImpl) IsUnderDiskPressure() bool { return hasNodeCondition(m.nodeConditions, api.NodeDiskPressure) } +func startMemoryThresholdNotifier(thresholds []Threshold, observations signalObservations, hard bool, handler thresholdNotifierHandlerFunc) error { + for _, threshold := range thresholds { + if threshold.Signal != SignalMemoryAvailable || hard != isHardEvictionThreshold(threshold) { + continue + } + observed, found := observations[SignalMemoryAvailable] + if !found { + continue + } + cgroups, err := cm.GetCgroupSubsystems() + if err != nil { + return err + } + // TODO add support for eviction from --cgroup-root + cgpath, found := cgroups.MountPoints["memory"] + if !found || len(cgpath) == 0 { + return fmt.Errorf("memory cgroup mount point not found") + } + attribute := "memory.usage_in_bytes" + quantity := getThresholdQuantity(threshold.Value, observed.capacity) + usageThreshold := resource.NewQuantity(observed.capacity.Value(), resource.DecimalSI) + usageThreshold.Sub(*quantity) + description := fmt.Sprintf("<%s available", formatThresholdValue(threshold.Value)) + memcgThresholdNotifier, err := NewMemCGThresholdNotifier(cgpath, attribute, usageThreshold.String(), description, handler) + if err != nil { + return err + } + go memcgThresholdNotifier.Start(wait.NeverStop) + return nil + } + return nil +} + // synchronize is the main control loop that enforces eviction thresholds. func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) { // if we have nothing to do, just return @@ -166,8 +203,27 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act return } - // find the list of thresholds that are met independent of grace period - now := m.clock.Now() + // attempt to create a threshold notifier to improve eviction response time + if !m.notifiersInitialized { + m.notifiersInitialized = true + // start soft memory notification + err = startMemoryThresholdNotifier(m.config.Thresholds, observations, false, func(desc string) { + glog.Infof("soft memory eviction threshold crossed at %s", desc) + // TODO wait grace period for soft memory limit + m.synchronize(diskInfoProvider, podFunc) + }) + if err != nil { + glog.Warningf("eviction manager: failed to create hard memory threshold notifier: %v", err) + } + // start hard memory notification + err = startMemoryThresholdNotifier(m.config.Thresholds, observations, true, func(desc string) { + glog.Infof("hard memory eviction threshold crossed at %s", desc) + m.synchronize(diskInfoProvider, podFunc) + }) + if err != nil { + glog.Warningf("eviction manager: failed to create soft memory threshold notifier: %v", err) + } + } // determine the set of thresholds met independent of grace period thresholds = thresholdsMet(thresholds, observations, false) @@ -182,6 +238,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act thresholds = thresholdsUpdatedStats(thresholds, observations, m.lastObservations) // track when a threshold was first observed + now := m.clock.Now() thresholdsFirstObservedAt := thresholdsFirstObservedAt(thresholds, m.thresholdsFirstObservedAt, now) // the set of node conditions that are triggered by currently observed thresholds @@ -218,7 +275,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act glog.Warningf("eviction manager: attempting to reclaim %v", resourceToReclaim) // determine if this is a soft or hard eviction associated with the resource - softEviction := isSoftEviction(thresholds, resourceToReclaim) + softEviction := isSoftEvictionThresholds(thresholds, resourceToReclaim) // record an event about the resources we are now attempting to reclaim via eviction m.recorder.Eventf(m.nodeRef, api.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim) diff --git a/pkg/kubelet/eviction/helpers.go b/pkg/kubelet/eviction/helpers.go index 290ca66226..b6bb414241 100644 --- a/pkg/kubelet/eviction/helpers.go +++ b/pkg/kubelet/eviction/helpers.go @@ -848,18 +848,23 @@ func getStarvedResources(thresholds []Threshold) []api.ResourceName { } // isSoftEviction returns true if the thresholds met for the starved resource are only soft thresholds -func isSoftEviction(thresholds []Threshold, starvedResource api.ResourceName) bool { +func isSoftEvictionThresholds(thresholds []Threshold, starvedResource api.ResourceName) bool { for _, threshold := range thresholds { if resourceToCheck := signalToResource[threshold.Signal]; resourceToCheck != starvedResource { continue } - if threshold.GracePeriod == time.Duration(0) { + if isHardEvictionThreshold(threshold) { return false } } return true } +// isSoftEviction returns true if the thresholds met for the starved resource are only soft thresholds +func isHardEvictionThreshold(threshold Threshold) bool { + return threshold.GracePeriod == time.Duration(0) +} + // buildResourceToRankFunc returns ranking functions associated with resources func buildResourceToRankFunc(withImageFs bool) map[api.ResourceName]rankFunc { resourceToRankFunc := map[api.ResourceName]rankFunc{ diff --git a/pkg/kubelet/eviction/threshold_notifier.go b/pkg/kubelet/eviction/threshold_notifier.go new file mode 100644 index 0000000000..e6b58376b4 --- /dev/null +++ b/pkg/kubelet/eviction/threshold_notifier.go @@ -0,0 +1,124 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package eviction + +/* +#include +*/ +import "C" + +import ( + "fmt" + "syscall" + + "github.com/golang/glog" +) + +// ThresholdNotifier notifies the user when an attribute crosses a threshold value +type ThresholdNotifier interface { + Start(stopCh <-chan struct{}) +} + +type memcgThresholdNotifier struct { + watchfd int + controlfd int + eventfd int + handler thresholdNotifierHandlerFunc + description string +} + +var _ ThresholdNotifier = &memcgThresholdNotifier{} + +// NewMemCGThresholdNotifier sends notifications when a cgroup threshold +// is crossed (in either direction) for a given cgroup attribute +func NewMemCGThresholdNotifier(path, attribute, threshold, description string, handler thresholdNotifierHandlerFunc) (ThresholdNotifier, error) { + watchfd, err := syscall.Open(fmt.Sprintf("%s/%s", path, attribute), syscall.O_RDONLY, 0) + if err != nil { + return nil, err + } + defer func() { + if err != nil { + syscall.Close(watchfd) + } + }() + controlfd, err := syscall.Open(fmt.Sprintf("%s/cgroup.event_control", path), syscall.O_WRONLY, 0) + if err != nil { + return nil, err + } + defer func() { + if err != nil { + syscall.Close(controlfd) + } + }() + efd, err := C.eventfd(0, C.EFD_CLOEXEC) + if err != nil { + return nil, err + } + eventfd := int(efd) + if eventfd < 0 { + err = fmt.Errorf("eventfd call failed") + return nil, err + } + defer func() { + if err != nil { + syscall.Close(eventfd) + } + }() + glog.V(2).Infof("eviction: setting notification threshold to %s", threshold) + config := fmt.Sprintf("%d %d %s", eventfd, watchfd, threshold) + _, err = syscall.Write(controlfd, []byte(config)) + if err != nil { + return nil, err + } + return &memcgThresholdNotifier{ + watchfd: watchfd, + controlfd: controlfd, + eventfd: eventfd, + handler: handler, + description: description, + }, nil +} + +func getThresholdEvents(eventfd int, eventCh chan<- int) { + for { + buf := make([]byte, 8) + _, err := syscall.Read(eventfd, buf) + if err != nil { + return + } + eventCh <- 0 + } +} + +func (n *memcgThresholdNotifier) Start(stopCh <-chan struct{}) { + eventCh := make(chan int, 1) + go getThresholdEvents(n.eventfd, eventCh) + for { + select { + case <-stopCh: + glog.V(2).Infof("eviction: stopping threshold notifier") + syscall.Close(n.watchfd) + syscall.Close(n.controlfd) + syscall.Close(n.eventfd) + close(eventCh) + return + case <-eventCh: + glog.V(2).Infof("eviction: threshold crossed") + n.handler(n.description) + } + } +} diff --git a/pkg/kubelet/eviction/types.go b/pkg/kubelet/eviction/types.go index 0a20611e8c..e373a98d29 100644 --- a/pkg/kubelet/eviction/types.go +++ b/pkg/kubelet/eviction/types.go @@ -161,3 +161,6 @@ type nodeReclaimFunc func() (*resource.Quantity, error) // nodeReclaimFuncs is an ordered list of nodeReclaimFunc type nodeReclaimFuncs []nodeReclaimFunc + +// thresholdNotifierHandlerFunc is a function that takes action in response to a crossed threshold +type thresholdNotifierHandlerFunc func(thresholdDescription string)