From cc5f5474d5fd6886a53a55555f5a6043c8e58778 Mon Sep 17 00:00:00 2001 From: Vishnu Kannan Date: Thu, 9 Feb 2017 21:14:10 -0800 Subject: [PATCH] add support for node allocatable phase 2 to kubelet Signed-off-by: Vishnu Kannan --- cmd/kubelet/app/BUILD | 5 +- cmd/kubelet/app/options/options.go | 4 +- cmd/kubelet/app/server.go | 135 ++++-- cmd/kubelet/app/server_test.go | 19 +- hack/.linted_packages | 1 + hack/local-up-cluster.sh | 4 +- hack/verify-flags/exceptions.txt | 4 +- hack/verify-flags/known-flags.txt | 60 ++- pkg/kubelet/cm/BUILD | 6 + pkg/kubelet/cm/container_manager.go | 12 + pkg/kubelet/cm/container_manager_linux.go | 68 ++- pkg/kubelet/cm/container_manager_stub.go | 4 + .../cm/container_manager_unsupported.go | 7 +- pkg/kubelet/cm/container_manager_windows.go | 3 +- pkg/kubelet/cm/node_allocatable.go | 164 ------- pkg/kubelet/cm/node_container_manager.go | 229 ++++++++++ pkg/kubelet/cm/node_container_manager_test.go | 305 +++++++++++++ pkg/kubelet/events/event.go | 46 +- pkg/kubelet/eviction/BUILD | 7 +- pkg/kubelet/eviction/api/BUILD | 28 ++ pkg/kubelet/eviction/api/types.go | 79 ++++ pkg/kubelet/eviction/eviction_manager.go | 11 +- pkg/kubelet/eviction/eviction_manager_test.go | 77 ++-- pkg/kubelet/eviction/helpers.go | 155 ++++--- pkg/kubelet/eviction/helpers_test.go | 405 +++++++++--------- pkg/kubelet/eviction/types.go | 55 +-- pkg/kubelet/kubelet.go | 52 --- pkg/kubelet/kubelet_node_status.go | 16 +- pkg/kubelet/kubelet_node_status_test.go | 40 +- pkg/kubelet/kubelet_resources.go | 4 +- pkg/kubelet/kubelet_resources_test.go | 25 +- pkg/kubelet/kubelet_test.go | 6 - test/e2e_node/BUILD | 4 +- test/e2e_node/container_manager_test.go | 3 +- test/e2e_node/node_container_manager_test.go | 247 +++++++++++ ...test.go => pods_container_manager_test.go} | 10 +- test/e2e_node/util.go | 7 + 37 files changed, 1577 insertions(+), 730 deletions(-) delete mode 100644 pkg/kubelet/cm/node_allocatable.go create mode 100644 pkg/kubelet/cm/node_container_manager.go create mode 100644 pkg/kubelet/cm/node_container_manager_test.go create mode 100644 pkg/kubelet/eviction/api/BUILD create mode 100644 pkg/kubelet/eviction/api/types.go create mode 100644 test/e2e_node/node_container_manager_test.go rename test/e2e_node/{cgroup_manager_test.go => pods_container_manager_test.go} (96%) diff --git a/cmd/kubelet/app/BUILD b/cmd/kubelet/app/BUILD index 066d3fa1d2..ef13d2528e 100644 --- a/cmd/kubelet/app/BUILD +++ b/cmd/kubelet/app/BUILD @@ -18,7 +18,6 @@ go_test( tags = ["automanaged"], deps = [ "//pkg/apis/componentconfig:go_default_library", - "//pkg/kubelet:go_default_library", "//vendor:k8s.io/apimachinery/pkg/util/diff", "//vendor:k8s.io/client-go/rest", ], @@ -56,6 +55,8 @@ go_library( "//pkg/kubelet/config:go_default_library", "//pkg/kubelet/container:go_default_library", "//pkg/kubelet/dockertools:go_default_library", + "//pkg/kubelet/eviction:go_default_library", + "//pkg/kubelet/eviction/api:go_default_library", "//pkg/kubelet/network:go_default_library", "//pkg/kubelet/network/cni:go_default_library", "//pkg/kubelet/network/kubenet:go_default_library", @@ -98,10 +99,12 @@ go_library( "//vendor:github.com/spf13/cobra", "//vendor:github.com/spf13/pflag", "//vendor:golang.org/x/exp/inotify", + "//vendor:k8s.io/apimachinery/pkg/api/resource", "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1", "//vendor:k8s.io/apimachinery/pkg/runtime", "//vendor:k8s.io/apimachinery/pkg/types", "//vendor:k8s.io/apimachinery/pkg/util/runtime", + "//vendor:k8s.io/apimachinery/pkg/util/sets", 
"//vendor:k8s.io/apimachinery/pkg/util/wait", "//vendor:k8s.io/apiserver/pkg/authentication/authenticator", "//vendor:k8s.io/apiserver/pkg/authentication/authenticatorfactory", diff --git a/cmd/kubelet/app/options/options.go b/cmd/kubelet/app/options/options.go index 069f4d05a4..4ef0d35f9e 100644 --- a/cmd/kubelet/app/options/options.go +++ b/cmd/kubelet/app/options/options.go @@ -225,7 +225,6 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) { fs.Float64Var(&s.ChaosChance, "chaos-chance", s.ChaosChance, "If > 0.0, introduce random client errors and latency. Intended for testing. [default=0.0]") fs.BoolVar(&s.Containerized, "containerized", s.Containerized, "Experimental support for running kubelet in a container. Intended for testing. [default=false]") fs.Int64Var(&s.MaxOpenFiles, "max-open-files", s.MaxOpenFiles, "Number of files that can be opened by Kubelet process. [default=1000000]") - fs.BoolVar(&s.RegisterSchedulable, "register-schedulable", s.RegisterSchedulable, "Register the node as schedulable. Won't have any effect if register-node is false. [default=true]") fs.MarkDeprecated("register-schedulable", "will be removed in a future version") fs.Var(utiltaints.NewTaintsVar(&s.RegisterWithTaints), "register-with-taints", "Register the node with the given list of taints (comma seperated \"=:\"). No-op if register-node is false.") @@ -267,9 +266,8 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) { // Node Allocatable Flags fs.Var(&s.SystemReserved, "system-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs that describe resources reserved for non-kubernetes components. Currently only cpu and memory are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]") fs.Var(&s.KubeReserved, "kube-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs that describe resources reserved for kubernetes system components. Currently only cpu and memory are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]") - fs.StringSliceVar(&s.EnforceNodeAllocatable, "enforce-node-allocatable", s.EnforceNodeAllocatable, "A comma separated list of levels of node allocatable enforcement to be enforced by kubelet. Acceptible options are 'pods', 'system-reserved' & 'kube-reserved'. If the latter two options are specified, '--system-reserved-cgroup' & '--kube-reserved-cgroup' must also be set respectively. See https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md for more details. [default='']") fs.StringVar(&s.SystemReservedCgroup, "system-reserved-cgroup", s.SystemReservedCgroup, "Absolute name of the top level cgroup that is used to manage non-kubernetes components for which compute resources were reserved via '--system-reserved' flag. Ex. '/system-reserved'. [default='']") fs.StringVar(&s.KubeReservedCgroup, "kube-reserved-cgroup", s.KubeReservedCgroup, "Absolute name of the top level cgroup that is used to manage kubernetes components for which compute resources were reserved via '--kube-reserved' flag. Ex. '/kube-reserved'. [default='']") - fs.BoolVar(&s.ExperimentalNodeAllocatableIgnoreEvictionThreshold, "experimental-node-allocatable-ignore-eviction-threshold", s.ExperimentalNodeAllocatableIgnoreEvictionThreshold, "When set to 'true', Hard Eviction Thresholds will be ignored while calculating Node Allocatable. 
See https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md for more details. [default=false]")
+	fs.BoolVar(&s.ExperimentalNodeAllocatableIgnoreEvictionThreshold, "experimental-allocatable-ignore-eviction", s.ExperimentalNodeAllocatableIgnoreEvictionThreshold, "When set to 'true', Hard Eviction Thresholds will be ignored while calculating Node Allocatable. See https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md for more details. [default=false]")
 }
diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go
index 2bc489741e..89f5639ca2 100644
--- a/cmd/kubelet/app/server.go
+++ b/cmd/kubelet/app/server.go
@@ -36,6 +36,7 @@ import (
 	"github.com/spf13/cobra"
 	"github.com/spf13/pflag"
 
+	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
@@ -70,6 +71,8 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/config"
 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
 	"k8s.io/kubernetes/pkg/kubelet/dockertools"
+	"k8s.io/kubernetes/pkg/kubelet/eviction"
+	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
 	"k8s.io/kubernetes/pkg/kubelet/server"
 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
 	"k8s.io/kubernetes/pkg/util/configz"
@@ -82,12 +85,17 @@ import (
 	"k8s.io/kubernetes/pkg/version"
 )
 
+const (
+	// Kubelet component name
+	componentKubelet = "kubelet"
+)
+
 // NewKubeletCommand creates a *cobra.Command object with default parameters
 func NewKubeletCommand() *cobra.Command {
 	s := options.NewKubeletServer()
 	s.AddFlags(pflag.CommandLine)
 	cmd := &cobra.Command{
-		Use: "kubelet",
+		Use: componentKubelet,
 		Long: `The kubelet is the primary "node agent" that runs on each
node. The kubelet works in terms of a PodSpec. A PodSpec is a YAML or JSON object
that describes a pod. The kubelet takes a set of PodSpecs that are provided through
@@ -306,6 +314,44 @@ func initConfigz(kc *componentconfig.KubeletConfiguration) (*configz.Config, err
 	return cz, err
 }
 
+// validateConfig validates the configuration of the Kubelet and returns an error if the input configuration is invalid.
+func validateConfig(s *options.KubeletServer) error {
+	if !s.CgroupsPerQOS && len(s.EnforceNodeAllocatable) > 0 {
+		return fmt.Errorf("Node Allocatable enforcement is not supported unless Cgroups Per QOS feature is turned on")
+	}
+	if s.SystemCgroups != "" && s.CgroupRoot == "" {
+		return fmt.Errorf("invalid configuration: system container was specified and cgroup root was not specified")
+	}
+	for _, val := range s.EnforceNodeAllocatable {
+		switch val {
+		case cm.NodeAllocatableEnforcementKey:
+		case cm.SystemReservedEnforcementKey:
+		case cm.KubeReservedEnforcementKey:
+			continue
+		default:
+			return fmt.Errorf("invalid option %q specified for EnforceNodeAllocatable setting. Valid options are %q, %q or %q", val, cm.NodeAllocatableEnforcementKey, cm.SystemReservedEnforcementKey, cm.KubeReservedEnforcementKey)
+		}
+	}
+	return nil
+}
+
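A minimal usage sketch of validateConfig (values here are hypothetical, not defaults): requesting any Node Allocatable enforcement level without the Cgroups Per QOS feature is rejected.

	s := options.NewKubeletServer()
	s.CgroupsPerQOS = false
	s.EnforceNodeAllocatable = []string{cm.NodeAllocatableEnforcementKey}
	if err := validateConfig(s); err != nil {
		// expected: enforcement requires --cgroups-per-qos to be enabled
	}

+// makeEventRecorder sets up kubeDeps.Recorder if it's nil. It's a no-op otherwise.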
+func makeEventRecorder(s *componentconfig.KubeletConfiguration, kubeDeps *kubelet.KubeletDeps, nodeName types.NodeName) { + if kubeDeps.Recorder != nil { + return + } + eventBroadcaster := record.NewBroadcaster() + kubeDeps.Recorder = eventBroadcaster.NewRecorder(api.Scheme, clientv1.EventSource{Component: componentKubelet, Host: string(nodeName)}) + eventBroadcaster.StartLogging(glog.V(3).Infof) + if kubeDeps.EventClient != nil { + glog.V(4).Infof("Sending events to api server.") + eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeDeps.EventClient.Events("")}) + } else { + glog.Warning("No api server defined - no events will be sent to API server.") + } + +} + func run(s *options.KubeletServer, kubeDeps *kubelet.KubeletDeps) (err error) { // TODO: this should be replaced by a --standalone flag standaloneMode := (len(s.APIServerList) == 0 && !s.RequireKubeConfig) @@ -363,6 +409,11 @@ func run(s *options.KubeletServer, kubeDeps *kubelet.KubeletDeps) (err error) { } } + // Validate configuration. + if err := validateConfig(s); err != nil { + return err + } + if kubeDeps == nil { var kubeClient clientset.Interface var eventClient v1core.EventsGetter @@ -381,11 +432,12 @@ func run(s *options.KubeletServer, kubeDeps *kubelet.KubeletDeps) (err error) { } } + nodeName, err := getNodeName(cloud, nodeutil.GetHostname(s.HostnameOverride)) + if err != nil { + return err + } + if s.BootstrapKubeconfig != "" { - nodeName, err := getNodeName(cloud, nodeutil.GetHostname(s.HostnameOverride)) - if err != nil { - return err - } if err := bootstrapClientCert(s.KubeConfig.Value(), s.BootstrapKubeconfig, s.CertDirectory, nodeName); err != nil { return err } @@ -429,12 +481,12 @@ func run(s *options.KubeletServer, kubeDeps *kubelet.KubeletDeps) (err error) { kubeDeps.EventClient = eventClient } - if kubeDeps.Auth == nil { - nodeName, err := getNodeName(kubeDeps.Cloud, nodeutil.GetHostname(s.HostnameOverride)) - if err != nil { - return err - } + nodeName, err := getNodeName(kubeDeps.Cloud, nodeutil.GetHostname(s.HostnameOverride)) + if err != nil { + return err + } + if kubeDeps.Auth == nil { auth, err := buildAuth(nodeName, kubeDeps.ExternalKubeClient, s.KubeletConfiguration) if err != nil { return err @@ -449,14 +501,30 @@ func run(s *options.KubeletServer, kubeDeps *kubelet.KubeletDeps) (err error) { } } + // Setup event recorder if required. + makeEventRecorder(&s.KubeletConfiguration, kubeDeps, nodeName) + if kubeDeps.ContainerManager == nil { - if s.SystemCgroups != "" && s.CgroupRoot == "" { - return fmt.Errorf("invalid configuration: system container was specified and cgroup root was not specified") - } if s.CgroupsPerQOS && s.CgroupRoot == "" { glog.Infof("--cgroups-per-qos enabled, but --cgroup-root was not specified. defaulting to /") s.CgroupRoot = "/" } + kubeReserved, err := parseResourceList(s.KubeReserved) + if err != nil { + return err + } + systemReserved, err := parseResourceList(s.SystemReserved) + if err != nil { + return err + } + var hardEvictionThresholds []evictionapi.Threshold + // If the user requested to ignore eviction thresholds, then do not set valid values for hardEvictionThresholds here. 
+ if !s.ExperimentalNodeAllocatableIgnoreEvictionThreshold { + hardEvictionThresholds, err = eviction.ParseThresholdConfig(s.EvictionHard, "", "", "") + if err != nil { + return err + } + } kubeDeps.ContainerManager, err = cm.NewContainerManager( kubeDeps.Mounter, kubeDeps.CAdvisorInterface, @@ -479,7 +547,8 @@ func run(s *options.KubeletServer, kubeDeps *kubelet.KubeletDeps) (err error) { HardEvictionThresholds: hardEvictionThresholds, }, }, - s.ExperimentalFailSwapOn) + s.ExperimentalFailSwapOn, + kubeDeps.Recorder) if err != nil { return err @@ -694,16 +763,8 @@ func RunKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *kubelet if err != nil { return err } - - eventBroadcaster := record.NewBroadcaster() - kubeDeps.Recorder = eventBroadcaster.NewRecorder(api.Scheme, clientv1.EventSource{Component: "kubelet", Host: string(nodeName)}) - eventBroadcaster.StartLogging(glog.V(3).Infof) - if kubeDeps.EventClient != nil { - glog.V(4).Infof("Sending events to api server.") - eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: kubeDeps.EventClient.Events("")}) - } else { - glog.Warning("No api server defined - no events will be sent to API server.") - } + // Setup event recorder if required. + makeEventRecorder(kubeCfg, kubeDeps, nodeName) // TODO(mtaufen): I moved the validation of these fields here, from UnsecuredKubeletConfig, // so that I could remove the associated fields from KubeletConfig. I would @@ -837,3 +898,29 @@ func CreateAndInitKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDep return k, nil } + +// parseResourceList parses the given configuration map into an API +// ResourceList or returns an error. +func parseResourceList(m componentconfig.ConfigurationMap) (v1.ResourceList, error) { + if len(m) == 0 { + return nil, nil + } + rl := make(v1.ResourceList) + for k, v := range m { + switch v1.ResourceName(k) { + // Only CPU and memory resources are supported. 
+ case v1.ResourceCPU, v1.ResourceMemory: + q, err := resource.ParseQuantity(v) + if err != nil { + return nil, err + } + if q.Sign() == -1 { + return nil, fmt.Errorf("resource quantity for %q cannot be negative: %v", k, v) + } + rl[v1.ResourceName(k)] = q + default: + return nil, fmt.Errorf("cannot reserve %q resource", k) + } + } + return rl, nil +} diff --git a/cmd/kubelet/app/server_test.go b/cmd/kubelet/app/server_test.go index 2da0e10491..0a5cbe4150 100644 --- a/cmd/kubelet/app/server_test.go +++ b/cmd/kubelet/app/server_test.go @@ -20,7 +20,6 @@ import ( "testing" "k8s.io/kubernetes/pkg/apis/componentconfig" - "k8s.io/kubernetes/pkg/kubelet" ) func TestValueOfAllocatableResources(t *testing.T) { @@ -32,13 +31,13 @@ func TestValueOfAllocatableResources(t *testing.T) { }{ { kubeReserved: "cpu=200m,memory=-150G", - systemReserved: "cpu=200m,memory=150G", + systemReserved: "cpu=200m,memory=15Ki", errorExpected: true, name: "negative quantity value", }, { - kubeReserved: "cpu=200m,memory=150GG", - systemReserved: "cpu=200m,memory=150G", + kubeReserved: "cpu=200m,memory=150Gi", + systemReserved: "cpu=200m,memory=15Ky", errorExpected: true, name: "invalid quantity unit", }, @@ -57,17 +56,15 @@ func TestValueOfAllocatableResources(t *testing.T) { kubeReservedCM.Set(test.kubeReserved) systemReservedCM.Set(test.systemReserved) - _, err := kubelet.ParseReservation(kubeReservedCM, systemReservedCM) - if err != nil { - t.Logf("%s: error returned: %v", test.name, err) - } + _, err1 := parseResourceList(kubeReservedCM) + _, err2 := parseResourceList(systemReservedCM) if test.errorExpected { - if err == nil { + if err1 == nil && err2 == nil { t.Errorf("%s: error expected", test.name) } } else { - if err != nil { - t.Errorf("%s: unexpected error: %v", test.name, err) + if err1 != nil || err2 != nil { + t.Errorf("%s: unexpected error: %v, %v", test.name, err1, err2) } } } diff --git a/hack/.linted_packages b/hack/.linted_packages index a380e77860..3b77be367c 100644 --- a/hack/.linted_packages +++ b/hack/.linted_packages @@ -173,6 +173,7 @@ pkg/kubelet/api pkg/kubelet/container pkg/kubelet/envvars pkg/kubelet/eviction +pkg/kubelet/eviction/api pkg/kubelet/util/csr pkg/kubelet/util/format pkg/kubelet/util/ioutils diff --git a/hack/local-up-cluster.sh b/hack/local-up-cluster.sh index 4fef9ca963..d1ab80f148 100755 --- a/hack/local-up-cluster.sh +++ b/hack/local-up-cluster.sh @@ -29,6 +29,7 @@ RUNTIME_CONFIG=${RUNTIME_CONFIG:-""} KUBELET_AUTHORIZATION_WEBHOOK=${KUBELET_AUTHORIZATION_WEBHOOK:-""} KUBELET_AUTHENTICATION_WEBHOOK=${KUBELET_AUTHENTICATION_WEBHOOK:-""} POD_MANIFEST_PATH=${POD_MANIFEST_PATH:-"/var/run/kubernetes/static-pods"} +KUBELET_FLAGS=${KUBELET_FLAGS:-""} # Name of the network plugin, eg: "kubenet" NET_PLUGIN=${NET_PLUGIN:-""} # Place the binaries required by NET_PLUGIN in this directory, eg: "/home/kubernetes/bin". @@ -603,7 +604,8 @@ function start_kubelet { ${net_plugin_args} \ ${container_runtime_endpoint_args} \ ${image_service_endpoint_args} \ - --port="$KUBELET_PORT" >"${KUBELET_LOG}" 2>&1 & + --port="$KUBELET_PORT" \ + ${KUBELET_FLAGS} >"${KUBELET_LOG}" 2>&1 & KUBELET_PID=$! # Quick check that kubelet is running. 
if ps -p $KUBELET_PID > /dev/null ; then diff --git a/hack/verify-flags/exceptions.txt b/hack/verify-flags/exceptions.txt index fc7f7c83b0..84692552cb 100644 --- a/hack/verify-flags/exceptions.txt +++ b/hack/verify-flags/exceptions.txt @@ -14,7 +14,6 @@ cluster/gce/configure-vm.sh: cloud_config: ${CLOUD_CONFIG} cluster/gce/configure-vm.sh: env-to-grains "feature_gates" cluster/gce/configure-vm.sh: env-to-grains "runtime_config" cluster/gce/configure-vm.sh: kubelet_api_servers: '${KUBELET_APISERVER}' -cluster/gce/configure-vm.sh: local -r client_ca_file="/srv/salt-overlay/salt/kubelet/ca.crt" cluster/gce/container-linux/configure-helper.sh: authorization_mode+=",ABAC" cluster/gce/container-linux/configure-helper.sh: authorization_mode+=",Webhook" cluster/gce/container-linux/configure-helper.sh: grep -o "{{ *pillar\.get('storage_backend', '\(.*\)') *}}" | \ @@ -40,7 +39,6 @@ cluster/gce/trusty/configure-helper.sh: sed -i -e "s@{{ *pillar\.get('storage cluster/gce/trusty/configure-helper.sh: sed -i -e "s@{{pillar\['allow_privileged'\]}}@true@g" "${src_file}" cluster/gce/util.sh: local node_ip=$(gcloud compute instances describe --project "${PROJECT}" --zone "${ZONE}" \ cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py: context['pillar'] = {'num_nodes': get_node_count()} -cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py: msg = "Cannot change {0} to {1}".format(service_cidr(), cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py: ca_cert_path = layer_options.get('ca_certificate_path') cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py: cluster_dns.set_dns_info(53, hookenv.config('dns_domain'), dns_ip) cluster/juju/layers/kubernetes-master/reactive/kubernetes_master.py: ip = service_cidr().split('/')[0] @@ -171,6 +169,8 @@ test/e2e_node/container_manager_test.go: return fmt.Errorf("expected pid %d's o test/e2e_node/container_manager_test.go: return fmt.Errorf("failed to get oom_score_adj for %d", pid) test/e2e_node/container_manager_test.go: return fmt.Errorf("failed to get oom_score_adj for %d: %v", pid, err) test/e2e_node/container_manager_test.go: procfsPath := path.Join("/proc", strconv.Itoa(pid), "oom_score_adj") +test/e2e_node/node_container_manager_test.go: kubeReservedCgroup = "/kube_reserved" +test/e2e_node/node_container_manager_test.go: systemReservedCgroup = "/system_reserved" test/images/mount-tester/mt.go: flag.BoolVar(&breakOnExpectedContent, "break_on_expected_content", true, "Break out of loop on expected content, (use with --file_content_in_loop flag only)") test/images/mount-tester/mt.go: flag.IntVar(&retryDuration, "retry_time", 180, "Retry time during the loop") test/images/mount-tester/mt.go: flag.StringVar(&readFileContentInLoopPath, "file_content_in_loop", "", "Path to read the file content in loop from") diff --git a/hack/verify-flags/known-flags.txt b/hack/verify-flags/known-flags.txt index be2a8ea62a..d36cac7ed1 100644 --- a/hack/verify-flags/known-flags.txt +++ b/hack/verify-flags/known-flags.txt @@ -7,9 +7,9 @@ advertised-address algorithm-provider all-namespaces allocate-node-cidrs +allowed-not-ready-nodes allow-missing-template-keys allow-privileged -allowed-not-ready-nodes anonymous-auth api-advertise-addresses api-burst @@ -19,6 +19,11 @@ api-prefix api-rate api-server-advertise-address api-server-port +apiserver-arg-overrides +apiserver-count +apiserver-count +api-server-port +api-servers api-server-service-type api-servers api-token @@ -31,10 +36,6 @@ audit-log-maxage audit-log-maxbackup 
audit-log-maxsize audit-log-path -auth-provider -auth-provider -auth-provider-arg -auth-provider-arg authentication-kubeconfig authentication-token-webhook authentication-token-webhook-cache-ttl @@ -46,6 +47,10 @@ authorization-rbac-super-user authorization-webhook-cache-authorized-ttl authorization-webhook-cache-unauthorized-ttl authorization-webhook-config-file +auth-provider +auth-provider +auth-provider-arg +auth-provider-arg azure-container-registry-config babysit-daemons basic-auth-file @@ -155,13 +160,13 @@ dns-port dns-provider dns-provider-config dns-zone-name +dockercfg-path docker-email docker-endpoint docker-exec-handler docker-password docker-server docker-username -dockercfg-path driver-port drop-embedded-fields dry-run @@ -185,6 +190,7 @@ enable-hostpath-provisioner enable-server enable-swagger-ui enable-taint-manager +enforce-node-allocatable etcd-address etcd-cafile etcd-certfile @@ -245,8 +251,8 @@ federated-kube-context federation-name federation-system-namespace file-check-frequency -file-suffix file_content_in_loop +file-suffix flex-volume-plugin-dir forward-services framework-name @@ -282,11 +288,11 @@ heapster-service horizontal-pod-autoscaler-sync-period host-cluster-context host-ipc-sources +hostname-override host-network-sources host-pid-sources host-port-endpoints host-system-namespace -hostname-override http-check-frequency http-port ignore-daemonsets @@ -298,9 +304,9 @@ image-project image-pull-policy image-pull-progress-deadline image-service-endpoint -include-extended-apis -include-extended-apis included-types-overrides +include-extended-apis +include-extended-apis initial-sync-timeout input-base input-dirs @@ -339,6 +345,8 @@ kops-ssh-key kops-state kops-up-timeout kops-zones +kubeadm-cmd-skip +kubeadm-path kube-api-burst kube-api-content-type kube-api-qps @@ -371,6 +379,11 @@ kubelet-read-only-port kubelet-root-dir kubelet-sync-frequency kubelet-timeout +kube-master +kube-master +kube-master-url +kube-reserved +kube-reserved-cgroup kubernetes-anywhere-cluster kubernetes-anywhere-path kubernetes-anywhere-phase2-provider @@ -404,6 +417,8 @@ master-os-distro master-service-namespace max-concurrency max-connection-bytes-per-sec +maximum-dead-containers +maximum-dead-containers-per-container max-log-age max-log-backups max-log-size @@ -413,8 +428,6 @@ max-outgoing-burst max-outgoing-qps max-pods max-requests-inflight -maximum-dead-containers -maximum-dead-containers-per-container mesos-authentication-principal mesos-authentication-provider mesos-authentication-secret-file @@ -430,23 +443,19 @@ mesos-sandbox-overlay mesos-user metrics-path min-available -min-pr-number -min-request-timeout -min-resync-period minimum-container-ttl-duration minimum-image-ttl-duration minion-max-log-age minion-max-log-backups minion-max-log-size minion-path-override +min-pr-number +min-request-timeout +min-resync-period namespace-sync-period network-plugin network-plugin-dir network-plugin-mtu -no-headers -no-headers -no-suggestions -no-suggestions node-cidr-mask-size node-eviction-rate node-instance-group @@ -465,7 +474,11 @@ node-schedulable-timeout node-startup-grace-period node-status-update-frequency node-sync-period +no-headers +no-headers non-masquerade-cidr +no-suggestions +no-suggestions num-nodes oidc-ca-file oidc-client-id @@ -474,7 +487,6 @@ oidc-issuer-url oidc-username-claim only-idl oom-score-adj -out-version outofdisk-transition-frequency output-base output-directory @@ -482,6 +494,7 @@ output-file-base output-package output-print-type output-version +out-version 
 path-override
 pod-cidr
 pod-eviction-timeout
@@ -506,6 +519,7 @@ proxy-logv
 proxy-mode
 proxy-port-range
 public-address-override
+pvclaimbinder-sync-period
 pv-recycler-increment-timeout-nfs
 pv-recycler-maximum-retry
 pv-recycler-minimum-timeout-hostpath
@@ -540,8 +554,8 @@ requestheader-client-ca-file
 requestheader-extra-headers-prefix
 requestheader-group-headers
 requestheader-username-headers
-require-kubeconfig
 required-contexts
+require-kubeconfig
 resolv-conf
 resource
 resource-container
@@ -624,6 +638,7 @@ sync-frequency
 system-cgroups
 system-pods-startup-timeout
 system-reserved
+system-reserved-cgroup
 system-validate-mode
 target-port
 target-ram-mb
@@ -637,6 +652,7 @@ tls-private-key-file
 tls-sni-cert-key
 to-version
 token-auth-file
+to-version
 ttl-keys-prefix
 ttl-secs
 type-src
@@ -652,6 +668,7 @@ use-service-account-credentials
 use-kubernetes-version
 use-taint-based-evictions
 user-whitelist
+use-service-account-credentials
 verb
 verify-only
 versioned-clientset-package
@@ -668,3 +685,4 @@ windows-line-endings
 www-prefix
 zone-id
 zone-name
+experimental-allocatable-ignore-eviction
diff --git a/pkg/kubelet/cm/BUILD b/pkg/kubelet/cm/BUILD
index 50095fed9d..dddc64123c 100644
--- a/pkg/kubelet/cm/BUILD
+++ b/pkg/kubelet/cm/BUILD
@@ -16,6 +16,7 @@ go_library(
         "container_manager_linux.go",
         "container_manager_stub.go",
         "helpers_linux.go",
+        "node_container_manager.go",
         "pod_container_manager_linux.go",
         "pod_container_manager_stub.go",
         "types.go",
@@ -25,6 +26,8 @@
         "//pkg/api/v1:go_default_library",
         "//pkg/kubelet/cadvisor:go_default_library",
         "//pkg/kubelet/cm/util:go_default_library",
+        "//pkg/kubelet/events:go_default_library",
+        "//pkg/kubelet/eviction/api:go_default_library",
         "//pkg/kubelet/qos:go_default_library",
         "//pkg/util:go_default_library",
         "//pkg/util/mount:go_default_library",
@@ -43,6 +46,7 @@
         "//vendor:k8s.io/apimachinery/pkg/util/runtime",
         "//vendor:k8s.io/apimachinery/pkg/util/sets",
         "//vendor:k8s.io/apimachinery/pkg/util/wait",
+        "//vendor:k8s.io/client-go/tools/record",
     ],
 )
 
@@ -52,11 +56,13 @@ go_test(
         "cgroup_manager_linux_test.go",
         "container_manager_linux_test.go",
         "helpers_linux_test.go",
+        "node_container_manager_test.go",
     ],
     library = ":go_default_library",
     tags = ["automanaged"],
     deps = [
         "//pkg/api/v1:go_default_library",
+        "//pkg/kubelet/eviction/api:go_default_library",
         "//pkg/util/mount:go_default_library",
         "//vendor:github.com/stretchr/testify/assert",
         "//vendor:github.com/stretchr/testify/require",
diff --git a/pkg/kubelet/cm/container_manager.go b/pkg/kubelet/cm/container_manager.go
index 62f42cfbd7..75228bf3f0 100644
--- a/pkg/kubelet/cm/container_manager.go
+++ b/pkg/kubelet/cm/container_manager.go
@@ -18,7 +18,9 @@ package cm
 
 import (
 	"k8s.io/apimachinery/pkg/util/sets"
+	// TODO: Migrate kubelet to either use its own internal objects or client library.
 	"k8s.io/kubernetes/pkg/api/v1"
+	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
 )
 
 // Manages the containers running on a machine.
@@ -47,6 +49,9 @@ type ContainerManager interface {
 
 	// GetQOSContainersInfo returns the names of top level QoS containers
 	GetQOSContainersInfo() QOSContainersInfo
+
+	// GetNodeAllocatableReservation returns the amount of compute resources that have to be reserved from scheduling.
+	GetNodeAllocatableReservation() v1.ResourceList
 }
 
 type NodeConfig struct {
@@ -75,3 +80,10 @@ type Status struct {
 	// Any soft requirements that were unsatisfied.
 	SoftRequirements error
 }
+
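As a worked example of what the new interface method reports (numbers are illustrative, not defaults): on a node with 10 CPU and 10Gi of memory, configured with kube-reserved cpu=500m,memory=1Gi, system-reserved cpu=500m,memory=1Gi, and a hard eviction threshold of memory.available<100Mi, the reservation withheld from scheduling adds up as follows:

	reservation := cm.GetNodeAllocatableReservation()
	// reservation[v1.ResourceCPU]    == 1      (500m kube-reserved + 500m system-reserved)
	// reservation[v1.ResourceMemory] == 2148Mi (1Gi + 1Gi + 100Mi hard eviction threshold)

+const (
+	// User visible keys for managing node allocatable enforcement on the node.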
+	NodeAllocatableEnforcementKey = "pods"
+	SystemReservedEnforcementKey  = "system-reserved"
+	KubeReservedEnforcementKey    = "kube-reserved"
+)
diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go
index c0b1bdd073..4cd4d9b45a 100644
--- a/pkg/kubelet/cm/container_manager_linux.go
+++ b/pkg/kubelet/cm/container_manager_linux.go
@@ -38,6 +38,7 @@ import (
 	"k8s.io/apimachinery/pkg/util/runtime"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/apimachinery/pkg/util/wait"
+	"k8s.io/client-go/tools/record"
 	"k8s.io/kubernetes/pkg/api/v1"
 	"k8s.io/kubernetes/pkg/kubelet/cadvisor"
 	cmutil "k8s.io/kubernetes/pkg/kubelet/cm/util"
@@ -101,10 +102,20 @@ type containerManagerImpl struct {
 	// External containers being managed.
 	systemContainers []*systemContainer
 	qosContainers    QOSContainersInfo
-	periodicTasks    []func()
+	// Tasks that are run periodically
+	periodicTasks []func()
 	// holds all the mounted cgroup subsystems
 	subsystems *CgroupSubsystems
 	nodeInfo   *v1.Node
+	// Interface for cgroup management
+	cgroupManager CgroupManager
+	// Capacity of this node.
+	capacity v1.ResourceList
+	// Absolute cgroupfs path to a cgroup that Kubelet needs to place all pods under.
+	// This path includes a top level container for enforcing Node Allocatable.
+	cgroupRoot string
+	// Event recorder interface.
+	recorder record.EventRecorder
 }
 
 type features struct {
@@ -167,7 +178,7 @@ func validateSystemRequirements(mountUtil mount.Interface) (features, error) {
 // TODO(vmarmol): Add limits to the system containers.
 // Takes the absolute name of the specified containers.
 // Empty container name disables use of the specified container.
-func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.Interface, nodeConfig NodeConfig, failSwapOn bool) (ContainerManager, error) {
+func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.Interface, nodeConfig NodeConfig, failSwapOn bool, recorder record.EventRecorder) (ContainerManager, error) {
 	subsystems, err := GetCgroupSubsystems()
 	if err != nil {
 		return nil, fmt.Errorf("failed to get mounted cgroup subsystems: %v", err)
@@ -204,7 +215,17 @@
 			"This will be a fatal error by default starting in K8s v1.6! " +
 			"In the meantime, you can opt-in to making this a fatal error by enabling --experimental-fail-swap-on.")
 	}
+	var capacity = v1.ResourceList{}
+	// It is safe to invoke `MachineInfo` on cAdvisor before logically initializing cAdvisor here because
+	// machine info is computed and cached once as part of cAdvisor object creation.
+	if info, err := cadvisorInterface.MachineInfo(); err == nil {
+		capacity = cadvisor.CapacityFromMachineInfo(info)
+	} else {
+		return nil, err
+	}
+
+	cgroupRoot := nodeConfig.CgroupRoot
+	cgroupManager := NewCgroupManager(subsystems, nodeConfig.CgroupDriver)
 	// Check if Cgroup-root actually exists on the node
 	if nodeConfig.CgroupsPerQOS {
 		// this does default to / when enabled, but this tests against regressions.
@@ -216,17 +237,24 @@
 		// of note, we always use the cgroupfs driver when performing this check since
 		// the input is provided in that format.
 		// this is important because we do not want any name conversion to occur.
-		cgroupManager := NewCgroupManager(subsystems, "cgroupfs")
-		if !cgroupManager.Exists(CgroupName(nodeConfig.CgroupRoot)) {
-			return nil, fmt.Errorf("invalid configuration: cgroup-root doesn't exist: %v", err)
+		if !cgroupManager.Exists(CgroupName(cgroupRoot)) {
+			return nil, fmt.Errorf("invalid configuration: cgroup-root %q doesn't exist: %v", cgroupRoot, err)
 		}
-		glog.Infof("container manager verified cgroup-root exists: %v", nodeConfig.CgroupRoot)
+		glog.Infof("container manager verified user-specified cgroup-root exists: %v", cgroupRoot)
+		// Include the top level cgroup for enforcing node allocatable into cgroup-root.
+		// This way, all sub modules can avoid having to understand the concept of node allocatable.
+		cgroupRoot = path.Join(cgroupRoot, defaultNodeAllocatableCgroupName)
 	}
+	glog.Infof("Creating Container Manager object based on Node Config: %+v", nodeConfig)
 	return &containerManagerImpl{
 		cadvisorInterface: cadvisorInterface,
 		mountUtil:         mountUtil,
 		NodeConfig:        nodeConfig,
 		subsystems:        subsystems,
+		cgroupManager:     cgroupManager,
+		capacity:          capacity,
+		cgroupRoot:        cgroupRoot,
+		recorder:          recorder,
 	}, nil
 }
 
@@ -239,11 +267,11 @@ func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
 			qosContainersInfo: cm.qosContainers,
 			nodeInfo:          cm.nodeInfo,
 			subsystems:        cm.subsystems,
-			cgroupManager:     NewCgroupManager(cm.subsystems, cm.NodeConfig.CgroupDriver),
+			cgroupManager:     cm.cgroupManager,
 		}
 	}
 	return &podContainerManagerNoop{
-		cgroupRoot: CgroupName(cm.NodeConfig.CgroupRoot),
+		cgroupRoot: CgroupName(cm.cgroupRoot),
 	}
 }
 
@@ -373,13 +401,21 @@ func (cm *containerManagerImpl) setupNode() error {
 
 	// Setup top level qos containers only if CgroupsPerQOS flag is specified as true
 	if cm.NodeConfig.CgroupsPerQOS {
-		qosContainersInfo, err := InitQOS(cm.NodeConfig.CgroupDriver, cm.NodeConfig.CgroupRoot, cm.subsystems)
+		if err := cm.createNodeAllocatableCgroups(); err != nil {
+			return err
+		}
+		qosContainersInfo, err := InitQOS(cm.NodeConfig.CgroupDriver, cm.cgroupRoot, cm.subsystems)
 		if err != nil {
 			return fmt.Errorf("failed to initialise top level QOS containers: %v", err)
 		}
 		cm.qosContainers = qosContainersInfo
 	}
 
+	// Enforce Node Allocatable (if required)
+	if err := cm.enforceNodeAllocatableCgroups(); err != nil {
+		return err
+	}
+
 	systemContainers := []*systemContainer{}
 	if cm.ContainerRuntime == "docker" {
 		dockerVersion := getDockerVersion(cm.cadvisorInterface)
@@ -405,11 +441,7 @@ func (cm *containerManagerImpl) setupNode() error {
 		})
 	} else if cm.RuntimeCgroupsName != "" {
 		cont := newSystemCgroups(cm.RuntimeCgroupsName)
-		var capacity = v1.ResourceList{}
-		if info, err := cm.cadvisorInterface.MachineInfo(); err == nil {
-			capacity = cadvisor.CapacityFromMachineInfo(info)
-		}
-		memoryLimit := (int64(capacity.Memory().Value() * DockerMemoryLimitThresholdPercent / 100))
+		memoryLimit := (int64(cm.capacity.Memory().Value() * DockerMemoryLimitThresholdPercent / 100))
 		if memoryLimit < MinDockerMemoryLimit {
 			glog.Warningf("Memory limit %d for container %s is too small, reset it to %d", memoryLimit, cm.RuntimeCgroupsName, MinDockerMemoryLimit)
 			memoryLimit = MinDockerMemoryLimit
@@ -544,6 +576,10 @@ func (cm *containerManagerImpl) Start(node *v1.Node) error {
 	if err := cm.setupNode(); err != nil {
 		return err
 	}
+	// Ensure that node allocatable configuration is valid.
+	if err := cm.validateNodeAllocatable(); err != nil {
+		return err
+	}
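To make the cgroup-root rewiring above concrete, a short sketch (values illustrative): with --cgroup-root=/ and --cgroups-per-qos enabled, every sub module now operates under the node allocatable cgroup without knowing about it.

	cgroupRoot := "/"                                                    // from --cgroup-root
	cgroupRoot = path.Join(cgroupRoot, defaultNodeAllocatableCgroupName) // "/kubepods"
	// QoS and pod cgroups are then created underneath, e.g. "/kubepods/burstable".

 	// Don't run a background thread if there are no ensureStateFuncs.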
hasEnsureStateFuncs := false for _, cont := range cm.systemContainers { @@ -823,3 +859,7 @@ func getDockerVersion(cadvisor cadvisor.Interface) *utilversion.Version { } return dockerVersion } + +func (m *containerManagerImpl) GetCapacity() v1.ResourceList { + return m.capacity +} diff --git a/pkg/kubelet/cm/container_manager_stub.go b/pkg/kubelet/cm/container_manager_stub.go index cce42afcd9..fb38b217dd 100644 --- a/pkg/kubelet/cm/container_manager_stub.go +++ b/pkg/kubelet/cm/container_manager_stub.go @@ -50,6 +50,10 @@ func (cm *containerManagerStub) Status() Status { return Status{} } +func (cm *containerManagerStub) GetNodeAllocatableReservation() v1.ResourceList { + return nil +} + func (cm *containerManagerStub) NewPodContainerManager() PodContainerManager { return &podContainerManagerStub{} } diff --git a/pkg/kubelet/cm/container_manager_unsupported.go b/pkg/kubelet/cm/container_manager_unsupported.go index 5199f6d483..b8dcfb0d31 100644 --- a/pkg/kubelet/cm/container_manager_unsupported.go +++ b/pkg/kubelet/cm/container_manager_unsupported.go @@ -21,6 +21,7 @@ package cm import ( "fmt" + "k8s.io/client-go/tools/record" "k8s.io/kubernetes/pkg/api/v1" "k8s.io/kubernetes/pkg/kubelet/cadvisor" "k8s.io/kubernetes/pkg/util/mount" @@ -55,10 +56,14 @@ func (cm *unsupportedContainerManager) Status() Status { return Status{} } +func (cm *unsupportedContainerManager) GetNodeAllocatableReservation() v1.ResourceList { + return nil +} + func (cm *unsupportedContainerManager) NewPodContainerManager() PodContainerManager { return &unsupportedPodContainerManager{} } -func NewContainerManager(_ mount.Interface, _ cadvisor.Interface, _ NodeConfig, failSwapOn bool) (ContainerManager, error) { +func NewContainerManager(_ mount.Interface, _ cadvisor.Interface, _ NodeConfig, failSwapOn bool, recorder record.EventRecorder) (ContainerManager, error) { return &unsupportedContainerManager{}, nil } diff --git a/pkg/kubelet/cm/container_manager_windows.go b/pkg/kubelet/cm/container_manager_windows.go index 0578085ca4..8a90fc2de9 100644 --- a/pkg/kubelet/cm/container_manager_windows.go +++ b/pkg/kubelet/cm/container_manager_windows.go @@ -21,6 +21,7 @@ package cm import ( "github.com/golang/glog" + "k8s.io/client-go/tools/record" "k8s.io/kubernetes/pkg/api/v1" "k8s.io/kubernetes/pkg/kubelet/cadvisor" "k8s.io/kubernetes/pkg/util/mount" @@ -37,6 +38,6 @@ func (cm *containerManagerImpl) Start(_ *v1.Node) error { return nil } -func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.Interface, nodeConfig NodeConfig, failSwapOn bool) (ContainerManager, error) { +func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.Interface, nodeConfig NodeConfig, failSwapOn bool, recorder record.EventRecorder) (ContainerManager, error) { return &containerManagerImpl{}, nil } diff --git a/pkg/kubelet/cm/node_allocatable.go b/pkg/kubelet/cm/node_allocatable.go deleted file mode 100644 index f9b9914744..0000000000 --- a/pkg/kubelet/cm/node_allocatable.go +++ /dev/null @@ -1,164 +0,0 @@ -// +build linux - -/* -Copyright 2017 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ - -package cm - -import ( - "fmt" - - "github.com/golang/glog" - - "k8s.io/kubernetes/pkg/api/v1" - evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" -) - -const ( - defaultNodeAllocatableCgroupName = "/kubepods" - nodeAllocatableEnforcementKey = "pods" - systemReservedEnforcementKey = "system-reserved" - kubeReservedEnforcementKey = "kube-reserved" -) - -func createNodeAllocatableCgroups(nc NodeAllocatableConfig, nodeAllocatable v1.ResourceList, cgroupManager CgroupManager) error { - cgroupConfig := &CgroupConfig{ - Name: CgroupName(defaultNodeAllocatableCgroupName), - } - if err := cgroupManager.Create(cgroupConfig); err != nil { - glog.Errorf("Failed to create %q cgroup and apply limits") - return err - } - return nil -} - -// Enforce Node Allocatable Cgroup settings. -func enforceNodeAllocatableCgroups(nc NodeAllocatableConfig, nodeAllocatable v1.ResourceList, cgroupManager CgroupManager) error { - glog.V(4).Infof("Attempting to enforce Node Allocatable with config: %+v", nc) - glog.V(4).Infof("Node Allocatable resources: %+v", nodeAllocatable) - // Create top level cgroups for all pods if necessary. - if nc.EnforceNodeAllocatable.Has(nodeAllocatableEnforcementKey) { - cgroupConfig := &CgroupConfig{ - Name: CgroupName(defaultNodeAllocatableCgroupName), - ResourceParameters: getCgroupConfig(nodeAllocatable), - } - glog.V(4).Infof("Updating Node Allocatable cgroup with %d cpu shares and %d bytes of memory", cgroupConfig.ResourceParameters.CpuShares, cgroupConfig.ResourceParameters.Memory) - if err := cgroupManager.Update(cgroupConfig); err != nil { - glog.Errorf("Failed to create %q cgroup and apply limits") - return err - } - } - // Now apply kube reserved and system reserved limits if required. - if nc.EnforceNodeAllocatable.Has(systemReservedEnforcementKey) { - glog.V(2).Infof("Enforcing system reserved on cgroup %q with limits: %+v", nc.SystemReservedCgroupName, nc.SystemReserved) - if err := enforceExistingCgroup(cgroupManager, nc.SystemReservedCgroupName, nc.SystemReserved); err != nil { - return fmt.Errorf("failed to enforce System Reserved Cgroup Limits: %v", err) - } - } - if nc.EnforceNodeAllocatable.Has(kubeReservedEnforcementKey) { - glog.V(2).Infof("Enforcing kube reserved on cgroup %q with limits: %+v", nc.KubeReservedCgroupName, nc.KubeReserved) - if err := enforceExistingCgroup(cgroupManager, nc.KubeReservedCgroupName, nc.KubeReserved); err != nil { - return fmt.Errorf("failed to enforce Kube Reserved Cgroup Limits: %v", err) - } - } - return nil -} - -func enforceExistingCgroup(cgroupManager CgroupManager, cName string, rl v1.ResourceList) error { - cgroupConfig := &CgroupConfig{ - Name: CgroupName(cName), - ResourceParameters: getCgroupConfig(rl), - } - glog.V(4).Infof("Enforcing limits on cgroup %q with %d cpu shares and %d bytes of memory", cName, cgroupConfig.ResourceParameters.CpuShares, cgroupConfig.ResourceParameters.Memory) - if !cgroupManager.Exists(cgroupConfig.Name) { - return fmt.Errorf("%q cgroup does not exist", cgroupConfig.Name) - } - if err := cgroupManager.Update(cgroupConfig); err != nil { - return err - } - return nil -} - -func getCgroupConfig(rl v1.ResourceList) *ResourceConfig { - // TODO(vishh): Set CPU Quota if necessary. - if rl == nil { - return nil - } - var rc ResourceConfig - if q, exists := rl[v1.ResourceMemory]; exists { - // Memory is defined in bytes. 
- val := q.Value() - rc.Memory = &val - } - if q, exists := rl[v1.ResourceCPU]; exists { - // CPU is defined in milli-cores. - val := MilliCPUToShares(q.MilliValue()) - rc.CpuShares = &val - } - return &rc -} - -func (cm *containerManagerImpl) getNodeAllocatableInternal(includeHardEviction bool) v1.ResourceList { - var evictionReservation v1.ResourceList - if includeHardEviction { - evictionReservation = hardEvictionReservation(cm.HardEvictionThresholds, cm.capacity) - } - result := make(v1.ResourceList) - for k, v := range cm.capacity { - value := *(v.Copy()) - if cm.NodeConfig.SystemReserved != nil { - value.Sub(cm.NodeConfig.SystemReserved[k]) - } - if cm.NodeConfig.KubeReserved != nil { - value.Sub(cm.NodeConfig.KubeReserved[k]) - } - if evictionReservation != nil { - value.Sub(evictionReservation[k]) - } - if value.Sign() < 0 { - // Negative Allocatable resources don't make sense. - value.Set(0) - } - result[k] = value - } - return result - -} - -// GetNodeAllocatable returns amount of compute resource available for pods. -func (cm *containerManagerImpl) GetNodeAllocatable() v1.ResourceList { - return cm.getNodeAllocatableInternal(!cm.NodeConfig.IgnoreHardEvictionThreshold) -} - -// hardEvictionReservation returns a resourcelist that includes reservation of resources based on hard eviction thresholds. -func hardEvictionReservation(thresholds []evictionapi.Threshold, capacity v1.ResourceList) v1.ResourceList { - if len(thresholds) == 0 { - return nil - } - ret := v1.ResourceList{} - for _, threshold := range thresholds { - if threshold.Operator != evictionapi.OpLessThan { - continue - } - switch threshold.Signal { - case evictionapi.SignalMemoryAvailable: - memoryCapacity := capacity[v1.ResourceMemory] - value := evictionapi.GetThresholdQuantity(threshold.Value, &memoryCapacity) - ret[v1.ResourceMemory] = *value - } - } - return ret -} diff --git a/pkg/kubelet/cm/node_container_manager.go b/pkg/kubelet/cm/node_container_manager.go new file mode 100644 index 0000000000..29b2fd1122 --- /dev/null +++ b/pkg/kubelet/cm/node_container_manager.go @@ -0,0 +1,229 @@ +// +build linux + +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cm + +import ( + "fmt" + "strings" + "time" + + "github.com/golang/glog" + + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/kubernetes/pkg/api/v1" + "k8s.io/kubernetes/pkg/kubelet/events" + evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" +) + +const ( + defaultNodeAllocatableCgroupName = "kubepods" +) + +func (cm *containerManagerImpl) createNodeAllocatableCgroups() error { + cgroupConfig := &CgroupConfig{ + Name: CgroupName(cm.cgroupRoot), + // The default limits for cpu shares can be very low which can lead to CPU starvation for pods. 
+		ResourceParameters: getCgroupConfig(cm.capacity),
+	}
+	if cm.cgroupManager.Exists(cgroupConfig.Name) {
+		return nil
+	}
+	if err := cm.cgroupManager.Create(cgroupConfig); err != nil {
+		glog.Errorf("Failed to create %q cgroup", cm.cgroupRoot)
+		return err
+	}
+	return nil
+}
+
+// Enforce Node Allocatable Cgroup settings.
+func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
+	nc := cm.NodeConfig.NodeAllocatableConfig
+
+	// We need to update limits on node allocatable cgroup no matter what because
+	// default cpu shares on cgroups are low and can cause cpu starvation.
+	nodeAllocatable := cm.capacity
+	// Use Node Allocatable limits instead of capacity if the user requested enforcing node allocatable.
+	if cm.CgroupsPerQOS && nc.EnforceNodeAllocatable.Has(NodeAllocatableEnforcementKey) {
+		nodeAllocatable = cm.getNodeAllocatableAbsolute()
+	}
+
+	glog.V(4).Infof("Attempting to enforce Node Allocatable with config: %+v", nc)
+
+	cgroupConfig := &CgroupConfig{
+		Name:               CgroupName(cm.cgroupRoot),
+		ResourceParameters: getCgroupConfig(nodeAllocatable),
+	}
+	// If Node Allocatable is enforced on a node that has not been drained or is updated on an existing node to a lower value,
+	// existing memory usage across pods might be higher than current Node Allocatable Memory Limits.
+	// Pod Evictions are expected to bring down memory usage to below Node Allocatable limits.
+	// Until evictions happen, retry cgroup updates.
+	// Update limits on non root cgroup-root to be safe since the default limits for CPU can be too low.
+	if cm.cgroupRoot != "/" {
+		go func() {
+			for {
+				err := cm.cgroupManager.Update(cgroupConfig)
+				if err == nil {
+					cm.recorder.Event(cm.nodeInfo, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated Node Allocatable limit across pods")
+					return
+				}
+				message := fmt.Sprintf("Failed to update Node Allocatable Limits %q: %v", cm.cgroupRoot, err)
+				cm.recorder.Event(cm.nodeInfo, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
+				time.Sleep(time.Minute)
+			}
+		}()
+	}
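For a sense of the numbers actually written to the cgroup, a sketch with illustrative quantities (getCgroupConfig and MilliCPUToShares are the helpers used by this file): memory maps to bytes and CPU to cgroup shares.

	rl := v1.ResourceList{
		v1.ResourceCPU:    resource.MustParse("500m"),
		v1.ResourceMemory: resource.MustParse("1Gi"),
	}
	rc := getCgroupConfig(rl)
	// *rc.CpuShares == 512        (500 * 1024 / 1000)
	// *rc.Memory    == 1073741824 (1Gi in bytes)

+	// Now apply kube reserved and system reserved limits if required.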
+ if nc.EnforceNodeAllocatable.Has(SystemReservedEnforcementKey) { + glog.V(2).Infof("Enforcing System reserved on cgroup %q with limits: %+v", nc.SystemReservedCgroupName, nc.SystemReserved) + if err := enforceExistingCgroup(cm.cgroupManager, nc.SystemReservedCgroupName, nc.SystemReserved); err != nil { + message := fmt.Sprintf("Failed to enforce System Reserved Cgroup Limits on %q: %v", nc.SystemReservedCgroupName, err) + cm.recorder.Event(cm.nodeInfo, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message) + return fmt.Errorf(message) + } + cm.recorder.Eventf(cm.nodeInfo, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated limits on system reserved cgroup %v", nc.SystemReservedCgroupName) + } + if nc.EnforceNodeAllocatable.Has(KubeReservedEnforcementKey) { + glog.V(2).Infof("Enforcing kube reserved on cgroup %q with limits: %+v", nc.KubeReservedCgroupName, nc.KubeReserved) + if err := enforceExistingCgroup(cm.cgroupManager, nc.KubeReservedCgroupName, nc.KubeReserved); err != nil { + message := fmt.Sprintf("Failed to enforce Kube Reserved Cgroup Limits on %q: %v", nc.KubeReservedCgroupName, err) + cm.recorder.Event(cm.nodeInfo, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message) + return fmt.Errorf(message) + } + cm.recorder.Eventf(cm.nodeInfo, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated limits on kube reserved cgroup %v", nc.KubeReservedCgroupName) + } + return nil +} + +// enforceExistingCgroup updates the limits `rl` on existing cgroup `cName` using `cgroupManager` interface. +func enforceExistingCgroup(cgroupManager CgroupManager, cName string, rl v1.ResourceList) error { + cgroupConfig := &CgroupConfig{ + Name: CgroupName(cName), + ResourceParameters: getCgroupConfig(rl), + } + glog.V(4).Infof("Enforcing limits on cgroup %q with %d cpu shares and %d bytes of memory", cName, cgroupConfig.ResourceParameters.CpuShares, cgroupConfig.ResourceParameters.Memory) + if !cgroupManager.Exists(cgroupConfig.Name) { + return fmt.Errorf("%q cgroup does not exist", cgroupConfig.Name) + } + if err := cgroupManager.Update(cgroupConfig); err != nil { + return err + } + return nil +} + +// Returns a ResourceConfig object that can be used to create or update cgroups via CgroupManager interface. +func getCgroupConfig(rl v1.ResourceList) *ResourceConfig { + // TODO(vishh): Set CPU Quota if necessary. + if rl == nil { + return nil + } + var rc ResourceConfig + if q, exists := rl[v1.ResourceMemory]; exists { + // Memory is defined in bytes. + val := q.Value() + rc.Memory = &val + } + if q, exists := rl[v1.ResourceCPU]; exists { + // CPU is defined in milli-cores. + val := MilliCPUToShares(q.MilliValue()) + rc.CpuShares = &val + } + return &rc +} + +// getNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement. +// Note that not all resources that are available on the node are included in the returned list of resources. +// Returns a ResourceList. +func (cm *containerManagerImpl) getNodeAllocatableAbsolute() v1.ResourceList { + result := make(v1.ResourceList) + for k, v := range cm.capacity { + value := *(v.Copy()) + if cm.NodeConfig.SystemReserved != nil { + value.Sub(cm.NodeConfig.SystemReserved[k]) + } + if cm.NodeConfig.KubeReserved != nil { + value.Sub(cm.NodeConfig.KubeReserved[k]) + } + if value.Sign() < 0 { + // Negative Allocatable resources don't make sense. 
+			value.Set(0)
+		}
+		result[k] = value
+	}
+	return result
+}
+
+// GetNodeAllocatableReservation returns the amount of compute resources that have to be reserved on this node from scheduling.
+func (cm *containerManagerImpl) GetNodeAllocatableReservation() v1.ResourceList {
+	evictionReservation := hardEvictionReservation(cm.HardEvictionThresholds, cm.capacity)
+	result := make(v1.ResourceList)
+	for k := range cm.capacity {
+		value := resource.NewQuantity(0, resource.DecimalSI)
+		if cm.NodeConfig.SystemReserved != nil {
+			value.Add(cm.NodeConfig.SystemReserved[k])
+		}
+		if cm.NodeConfig.KubeReserved != nil {
+			value.Add(cm.NodeConfig.KubeReserved[k])
+		}
+		if evictionReservation != nil {
+			value.Add(evictionReservation[k])
+		}
+		if !value.IsZero() {
+			result[k] = *value
+		}
+	}
+	return result
+}
+
+// hardEvictionReservation returns a resourcelist that includes reservation of resources based on hard eviction thresholds.
+func hardEvictionReservation(thresholds []evictionapi.Threshold, capacity v1.ResourceList) v1.ResourceList {
+	if len(thresholds) == 0 {
+		return nil
+	}
+	ret := v1.ResourceList{}
+	for _, threshold := range thresholds {
+		if threshold.Operator != evictionapi.OpLessThan {
+			continue
+		}
+		switch threshold.Signal {
+		case evictionapi.SignalMemoryAvailable:
+			memoryCapacity := capacity[v1.ResourceMemory]
+			value := evictionapi.GetThresholdQuantity(threshold.Value, &memoryCapacity)
+			ret[v1.ResourceMemory] = *value
+		}
+	}
+	return ret
+}
+
+// validateNodeAllocatable ensures that the user specified Node Allocatable Configuration doesn't reserve more than the node capacity.
+// Returns error if the configuration is invalid, nil otherwise.
+func (cm *containerManagerImpl) validateNodeAllocatable() error {
+	var errors []string
+	for key, reserved := range cm.GetNodeAllocatableReservation() {
+		capacity := cm.capacity[key]
+		value := *(capacity.Copy())
+		value.Sub(reserved)
+		if value.Sign() < 0 {
+			errors = append(errors, fmt.Sprintf("Resource %q has a reservation of %v, but a capacity of only %v", key, reserved, capacity))
+		}
+	}
+	if len(errors) > 0 {
+		return fmt.Errorf("Invalid Node Allocatable configuration. %s", strings.Join(errors, " "))
+	}
+	return nil
+}
diff --git a/pkg/kubelet/cm/node_container_manager_test.go b/pkg/kubelet/cm/node_container_manager_test.go
new file mode 100644
index 0000000000..40517ad23d
--- /dev/null
+++ b/pkg/kubelet/cm/node_container_manager_test.go
@@ -0,0 +1,305 @@
+// +build linux
+
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package cm + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/kubernetes/pkg/api/v1" + evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" +) + +func TestNodeAllocatableReservationForScheduling(t *testing.T) { + memoryEvictionThreshold := resource.MustParse("100Mi") + testCases := []struct { + kubeReserved v1.ResourceList + systemReserved v1.ResourceList + expected v1.ResourceList + capacity v1.ResourceList + hardThreshold evictionapi.ThresholdValue + }{ + { + kubeReserved: getResourceList("100m", "100Mi"), + systemReserved: getResourceList("50m", "50Mi"), + capacity: getResourceList("10", "10Gi"), + expected: getResourceList("150m", "150Mi"), + }, + { + kubeReserved: getResourceList("100m", "100Mi"), + systemReserved: getResourceList("50m", "50Mi"), + hardThreshold: evictionapi.ThresholdValue{ + Quantity: &memoryEvictionThreshold, + }, + capacity: getResourceList("10", "10Gi"), + expected: getResourceList("150m", "250Mi"), + }, + { + kubeReserved: getResourceList("100m", "100Mi"), + systemReserved: getResourceList("50m", "50Mi"), + capacity: getResourceList("10", "10Gi"), + hardThreshold: evictionapi.ThresholdValue{ + Percentage: 0.05, + }, + expected: getResourceList("150m", "694157320"), + }, + + { + kubeReserved: v1.ResourceList{}, + systemReserved: v1.ResourceList{}, + capacity: getResourceList("10", "10Gi"), + expected: getResourceList("", ""), + }, + { + kubeReserved: getResourceList("", "100Mi"), + systemReserved: getResourceList("50m", "50Mi"), + capacity: getResourceList("10", "10Gi"), + expected: getResourceList("50m", "150Mi"), + }, + + { + kubeReserved: getResourceList("50m", "100Mi"), + systemReserved: getResourceList("", "50Mi"), + capacity: getResourceList("10", "10Gi"), + expected: getResourceList("50m", "150Mi"), + }, + { + kubeReserved: getResourceList("", "100Mi"), + systemReserved: getResourceList("", "50Mi"), + capacity: getResourceList("10", ""), + expected: getResourceList("", "150Mi"), + }, + } + for idx, tc := range testCases { + nc := NodeConfig{ + NodeAllocatableConfig: NodeAllocatableConfig{ + KubeReserved: tc.kubeReserved, + SystemReserved: tc.systemReserved, + HardEvictionThresholds: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: tc.hardThreshold, + }, + }, + }, + } + cm := &containerManagerImpl{ + NodeConfig: nc, + capacity: tc.capacity, + } + for k, v := range cm.GetNodeAllocatableReservation() { + expected, exists := tc.expected[k] + assert.True(t, exists, "test case %d expected resource %q", idx+1, k) + assert.Equal(t, expected.MilliValue(), v.MilliValue(), "test case %d failed for resource %q", idx+1, k) + } + } +} + +func TestNodeAllocatableWithNilHardThreshold(t *testing.T) { + nc := NodeConfig{ + NodeAllocatableConfig: NodeAllocatableConfig{ + KubeReserved: getResourceList("100m", "100Mi"), + SystemReserved: getResourceList("50m", "50Mi"), + }, + } + cm := &containerManagerImpl{ + NodeConfig: nc, + capacity: getResourceList("10", "10Gi"), + } + expected := getResourceList("150m", "150Mi") + for k, v := range cm.GetNodeAllocatableReservation() { + expected, exists := expected[k] + assert.True(t, exists) + assert.Equal(t, expected.MilliValue(), v.MilliValue(), "failed for resource %q", k) + } +} + +func TestNodeAllocatableForEnforcement(t *testing.T) { + memoryEvictionThreshold := resource.MustParse("100Mi") + testCases := []struct { + kubeReserved v1.ResourceList + systemReserved v1.ResourceList + capacity 
v1.ResourceList + expected v1.ResourceList + hardThreshold evictionapi.ThresholdValue + }{ + { + kubeReserved: getResourceList("100m", "100Mi"), + systemReserved: getResourceList("50m", "50Mi"), + capacity: getResourceList("10", "10Gi"), + expected: getResourceList("9850m", "10090Mi"), + }, + { + kubeReserved: getResourceList("100m", "100Mi"), + systemReserved: getResourceList("50m", "50Mi"), + hardThreshold: evictionapi.ThresholdValue{ + Quantity: &memoryEvictionThreshold, + }, + capacity: getResourceList("10", "10Gi"), + expected: getResourceList("9850m", "10090Mi"), + }, + { + kubeReserved: getResourceList("100m", "100Mi"), + systemReserved: getResourceList("50m", "50Mi"), + hardThreshold: evictionapi.ThresholdValue{ + Percentage: 0.05, + }, + capacity: getResourceList("10", "10Gi"), + expected: getResourceList("9850m", "10090Mi"), + }, + + { + kubeReserved: v1.ResourceList{}, + systemReserved: v1.ResourceList{}, + capacity: getResourceList("10", "10Gi"), + expected: getResourceList("10", "10Gi"), + }, + { + kubeReserved: getResourceList("", "100Mi"), + systemReserved: getResourceList("50m", "50Mi"), + capacity: getResourceList("10", "10Gi"), + expected: getResourceList("9950m", "10090Mi"), + }, + + { + kubeReserved: getResourceList("50m", "100Mi"), + systemReserved: getResourceList("", "50Mi"), + capacity: getResourceList("10", "10Gi"), + expected: getResourceList("9950m", "10090Mi"), + }, + { + kubeReserved: getResourceList("", "100Mi"), + systemReserved: getResourceList("", "50Mi"), + capacity: getResourceList("10", ""), + expected: getResourceList("10", ""), + }, + } + for idx, tc := range testCases { + nc := NodeConfig{ + NodeAllocatableConfig: NodeAllocatableConfig{ + KubeReserved: tc.kubeReserved, + SystemReserved: tc.systemReserved, + HardEvictionThresholds: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: tc.hardThreshold, + }, + }, + }, + } + cm := &containerManagerImpl{ + NodeConfig: nc, + capacity: tc.capacity, + } + for k, v := range cm.getNodeAllocatableAbsolute() { + expected, exists := tc.expected[k] + assert.True(t, exists) + assert.Equal(t, expected.MilliValue(), v.MilliValue(), "test case %d failed for resource %q", idx+1, k) + } + } +} + +func TestNodeAllocatableInputValidation(t *testing.T) { + memoryEvictionThreshold := resource.MustParse("100Mi") + highMemoryEvictionThreshold := resource.MustParse("2Gi") + testCases := []struct { + kubeReserved v1.ResourceList + systemReserved v1.ResourceList + capacity v1.ResourceList + hardThreshold evictionapi.ThresholdValue + invalidConfiguration bool + }{ + { + kubeReserved: getResourceList("100m", "100Mi"), + systemReserved: getResourceList("50m", "50Mi"), + capacity: getResourceList("10", "10Gi"), + }, + { + kubeReserved: getResourceList("100m", "100Mi"), + systemReserved: getResourceList("50m", "50Mi"), + hardThreshold: evictionapi.ThresholdValue{ + Quantity: &memoryEvictionThreshold, + }, + capacity: getResourceList("10", "10Gi"), + }, + { + kubeReserved: getResourceList("100m", "100Mi"), + systemReserved: getResourceList("50m", "50Mi"), + hardThreshold: evictionapi.ThresholdValue{ + Percentage: 0.05, + }, + capacity: getResourceList("10", "10Gi"), + }, + { + kubeReserved: v1.ResourceList{}, + systemReserved: v1.ResourceList{}, + capacity: getResourceList("10", "10Gi"), + }, + { + kubeReserved: getResourceList("", "100Mi"), + systemReserved: getResourceList("50m", "50Mi"), + capacity: getResourceList("10", "10Gi"), + }, + { + kubeReserved: 
getResourceList("50m", "100Mi"), + systemReserved: getResourceList("", "50Mi"), + capacity: getResourceList("10", "10Gi"), + }, + { + kubeReserved: getResourceList("", "100Mi"), + systemReserved: getResourceList("", "50Mi"), + capacity: getResourceList("10", ""), + }, + { + kubeReserved: getResourceList("5", "10Gi"), + systemReserved: getResourceList("5", "10Gi"), + hardThreshold: evictionapi.ThresholdValue{ + Quantity: &highMemoryEvictionThreshold, + }, + capacity: getResourceList("10", "11Gi"), + invalidConfiguration: true, + }, + } + for _, tc := range testCases { + nc := NodeConfig{ + NodeAllocatableConfig: NodeAllocatableConfig{ + KubeReserved: tc.kubeReserved, + SystemReserved: tc.systemReserved, + HardEvictionThresholds: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: tc.hardThreshold, + }, + }, + }, + } + cm := &containerManagerImpl{ + NodeConfig: nc, + capacity: tc.capacity, + } + if err := cm.validateNodeAllocatable(); err != nil && !tc.invalidConfiguration { + t.Logf("Expected valid node allocatable configuration: %v", err) + t.FailNow() + } + } +} diff --git a/pkg/kubelet/events/event.go b/pkg/kubelet/events/event.go index 8124962f95..84be072a26 100644 --- a/pkg/kubelet/events/event.go +++ b/pkg/kubelet/events/event.go @@ -41,28 +41,30 @@ const ( BackOffPullImage = "BackOff" // kubelet event reason list - NodeReady = "NodeReady" - NodeNotReady = "NodeNotReady" - NodeSchedulable = "NodeSchedulable" - NodeNotSchedulable = "NodeNotSchedulable" - StartingKubelet = "Starting" - KubeletSetupFailed = "KubeletSetupFailed" - FailedDetachVolume = "FailedDetachVolume" - FailedMountVolume = "FailedMount" - FailedUnMountVolume = "FailedUnMount" - SuccessfulDetachVolume = "SuccessfulDetachVolume" - SuccessfulMountVolume = "SuccessfulMountVolume" - SuccessfulUnMountVolume = "SuccessfulUnMountVolume" - HostPortConflict = "HostPortConflict" - NodeSelectorMismatching = "NodeSelectorMismatching" - InsufficientFreeCPU = "InsufficientFreeCPU" - InsufficientFreeMemory = "InsufficientFreeMemory" - OutOfDisk = "OutOfDisk" - HostNetworkNotSupported = "HostNetworkNotSupported" - UndefinedShaper = "NilShaper" - NodeRebooted = "Rebooted" - ContainerGCFailed = "ContainerGCFailed" - ImageGCFailed = "ImageGCFailed" + NodeReady = "NodeReady" + NodeNotReady = "NodeNotReady" + NodeSchedulable = "NodeSchedulable" + NodeNotSchedulable = "NodeNotSchedulable" + StartingKubelet = "Starting" + KubeletSetupFailed = "KubeletSetupFailed" + FailedDetachVolume = "FailedDetachVolume" + FailedMountVolume = "FailedMount" + FailedUnMountVolume = "FailedUnMount" + SuccessfulDetachVolume = "SuccessfulDetachVolume" + SuccessfulMountVolume = "SuccessfulMountVolume" + SuccessfulUnMountVolume = "SuccessfulUnMountVolume" + HostPortConflict = "HostPortConflict" + NodeSelectorMismatching = "NodeSelectorMismatching" + InsufficientFreeCPU = "InsufficientFreeCPU" + InsufficientFreeMemory = "InsufficientFreeMemory" + OutOfDisk = "OutOfDisk" + HostNetworkNotSupported = "HostNetworkNotSupported" + UndefinedShaper = "NilShaper" + NodeRebooted = "Rebooted" + ContainerGCFailed = "ContainerGCFailed" + ImageGCFailed = "ImageGCFailed" + FailedNodeAllocatableEnforcement = "FailedNodeAllocatableEnforcement" + SuccessfulNodeAllocatableEnforcement = "NodeAllocatableEnforced" // Image manager event reason list InvalidDiskCapacity = "InvalidDiskCapacity" diff --git a/pkg/kubelet/eviction/BUILD b/pkg/kubelet/eviction/BUILD index 1d82152b3b..350e308345 100644 --- 
a/pkg/kubelet/eviction/BUILD +++ b/pkg/kubelet/eviction/BUILD @@ -33,6 +33,7 @@ go_test( "//pkg/api:go_default_library", "//pkg/api/v1:go_default_library", "//pkg/kubelet/api/v1alpha1/stats:go_default_library", + "//pkg/kubelet/eviction/api:go_default_library", "//pkg/kubelet/lifecycle:go_default_library", "//pkg/kubelet/types:go_default_library", "//pkg/quota:go_default_library", @@ -62,6 +63,7 @@ go_library( "//pkg/features:go_default_library", "//pkg/kubelet/api/v1alpha1/stats:go_default_library", "//pkg/kubelet/cm:go_default_library", + "//pkg/kubelet/eviction/api:go_default_library", "//pkg/kubelet/lifecycle:go_default_library", "//pkg/kubelet/pod:go_default_library", "//pkg/kubelet/qos:go_default_library", @@ -90,6 +92,9 @@ filegroup( filegroup( name = "all-srcs", - srcs = [":package-srcs"], + srcs = [ + ":package-srcs", + "//pkg/kubelet/eviction/api:all-srcs", + ], tags = ["automanaged"], ) diff --git a/pkg/kubelet/eviction/api/BUILD b/pkg/kubelet/eviction/api/BUILD new file mode 100644 index 0000000000..576526fcda --- /dev/null +++ b/pkg/kubelet/eviction/api/BUILD @@ -0,0 +1,28 @@ +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) + +load( + "@io_bazel_rules_go//go:def.bzl", + "go_library", +) + +go_library( + name = "go_default_library", + srcs = ["types.go"], + tags = ["automanaged"], + deps = ["//vendor:k8s.io/apimachinery/pkg/api/resource"], +) + +filegroup( + name = "package-srcs", + srcs = glob(["**"]), + tags = ["automanaged"], + visibility = ["//visibility:private"], +) + +filegroup( + name = "all-srcs", + srcs = [":package-srcs"], + tags = ["automanaged"], +) diff --git a/pkg/kubelet/eviction/api/types.go b/pkg/kubelet/eviction/api/types.go new file mode 100644 index 0000000000..306b4d787c --- /dev/null +++ b/pkg/kubelet/eviction/api/types.go @@ -0,0 +1,79 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package api + +import ( + "time" + + "k8s.io/apimachinery/pkg/api/resource" +) + +// Signal defines a signal that can trigger eviction of pods on a node. +type Signal string + +const ( + // SignalMemoryAvailable is memory available (i.e. capacity - workingSet), in bytes. + SignalMemoryAvailable Signal = "memory.available" + // SignalNodeFsAvailable is amount of storage available on filesystem that kubelet uses for volumes, daemon logs, etc. + SignalNodeFsAvailable Signal = "nodefs.available" + // SignalNodeFsInodesFree is amount of inodes available on filesystem that kubelet uses for volumes, daemon logs, etc. + SignalNodeFsInodesFree Signal = "nodefs.inodesFree" + // SignalImageFsAvailable is amount of storage available on filesystem that container runtime uses for storing images and container writable layers. + SignalImageFsAvailable Signal = "imagefs.available" + // SignalImageFsInodesFree is amount of inodes available on filesystem that container runtime uses for storing images and container writeable layers. 
+ SignalImageFsInodesFree Signal = "imagefs.inodesFree" +) + +// ThresholdOperator is the operator used to express a Threshold. +type ThresholdOperator string + +const ( + // OpLessThan is the operator that expresses a less than operator. + OpLessThan ThresholdOperator = "LessThan" +) + +// ThresholdValue is a value holder that abstracts literal versus percentage based quantity +type ThresholdValue struct { + // The following fields are exclusive. Only the topmost non-zero field is used. + + // Quantity is a quantity associated with the signal that is evaluated against the specified operator. + Quantity *resource.Quantity + // Percentage represents the usage percentage over the total resource that is evaluated against the specified operator. + Percentage float32 +} + +// Threshold defines a metric for when eviction should occur. +type Threshold struct { + // Signal defines the entity that was measured. + Signal Signal + // Operator represents a relationship of a signal to a value. + Operator ThresholdOperator + // Value is the threshold the resource is evaluated against. + Value ThresholdValue + // GracePeriod represents the amount of time that a threshold must be met before eviction is triggered. + GracePeriod time.Duration + // MinReclaim represents the minimum amount of resource to reclaim if the threshold is met. + MinReclaim *ThresholdValue +} + +// GetThresholdQuantity returns the expected quantity value for a thresholdValue +func GetThresholdQuantity(value ThresholdValue, capacity *resource.Quantity) *resource.Quantity { + if value.Quantity != nil { + return value.Quantity.Copy() + } + return resource.NewQuantity(int64(float64(capacity.Value())*float64(value.Percentage)), resource.BinarySI) +} diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index df52aa7deb..7240487434 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -33,6 +33,7 @@ import ( "k8s.io/kubernetes/pkg/api/v1" "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/kubelet/cm" + evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" "k8s.io/kubernetes/pkg/kubelet/lifecycle" kubepod "k8s.io/kubernetes/pkg/kubelet/pod" "k8s.io/kubernetes/pkg/kubelet/qos" @@ -66,7 +67,7 @@ type managerImpl struct { // records when a threshold was first observed thresholdsFirstObservedAt thresholdsObservedAt // records the set of thresholds that have been met (including graceperiod) but not yet resolved - thresholdsMet []Threshold + thresholdsMet []evictionapi.Threshold // resourceToRankFunc maps a resource to ranking function for that resource. resourceToRankFunc map[v1.ResourceName]rankFunc // resourceToNodeReclaimFuncs maps a resource to an ordered list of functions that know how to reclaim that resource. 
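Aside, not part of the patch: a minimal, self-contained sketch of how the new evictionapi.GetThresholdQuantity resolves the two exclusive ThresholdValue fields; the 10Gi capacity below is a hypothetical figure.

    package main

    import (
    	"fmt"

    	"k8s.io/apimachinery/pkg/api/resource"
    	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
    )

    func main() {
    	capacity := resource.MustParse("10Gi")

    	// A literal quantity is returned as a copy; capacity is ignored.
    	literal := evictionapi.ThresholdValue{Quantity: resource.NewQuantity(1<<30, resource.BinarySI)}
    	fmt.Println(evictionapi.GetThresholdQuantity(literal, &capacity)) // prints "1Gi"

    	// A percentage is resolved against capacity and truncated to whole bytes.
    	// Because Percentage is a float32, the result can drift a few bytes from the
    	// exact product: this prints 536870920 (~512Mi), not the exact 5% = 536870912.
    	pct := evictionapi.ThresholdValue{Percentage: 0.05}
    	fmt.Println(evictionapi.GetThresholdQuantity(pct, &capacity))
    }

The same float32 drift is why the scheduling-reservation test above expects 694157320 (150Mi + the float32-resolved 5% of 10Gi) rather than the exact 694157312.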
@@ -152,12 +153,12 @@ func (m *managerImpl) IsUnderDiskPressure() bool { return hasNodeCondition(m.nodeConditions, v1.NodeDiskPressure) } -func startMemoryThresholdNotifier(thresholds []Threshold, observations signalObservations, hard bool, handler thresholdNotifierHandlerFunc) error { +func startMemoryThresholdNotifier(thresholds []evictionapi.Threshold, observations signalObservations, hard bool, handler thresholdNotifierHandlerFunc) error { for _, threshold := range thresholds { - if threshold.Signal != SignalMemoryAvailable || hard != isHardEvictionThreshold(threshold) { + if threshold.Signal != evictionapi.SignalMemoryAvailable || hard != isHardEvictionThreshold(threshold) { continue } - observed, found := observations[SignalMemoryAvailable] + observed, found := observations[evictionapi.SignalMemoryAvailable] if !found { continue } @@ -171,7 +172,7 @@ func startMemoryThresholdNotifier(thresholds []Threshold, observations signalObs return fmt.Errorf("memory cgroup mount point not found") } attribute := "memory.usage_in_bytes" - quantity := getThresholdQuantity(threshold.Value, observed.capacity) + quantity := evictionapi.GetThresholdQuantity(threshold.Value, observed.capacity) usageThreshold := resource.NewQuantity(observed.capacity.Value(), resource.DecimalSI) usageThreshold.Sub(*quantity) description := fmt.Sprintf("<%s available", formatThresholdValue(threshold.Value)) diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go index 02097770a6..8a3f9c8437 100644 --- a/pkg/kubelet/eviction/eviction_manager_test.go +++ b/pkg/kubelet/eviction/eviction_manager_test.go @@ -29,6 +29,7 @@ import ( kubeapi "k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api/v1" statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats" + evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" "k8s.io/kubernetes/pkg/kubelet/lifecycle" kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" ) @@ -180,18 +181,18 @@ func TestMemoryPressure(t *testing.T) { config := Config{ MaxPodGracePeriodSeconds: 5, PressureTransitionPeriod: time.Minute * 5, - Thresholds: []Threshold{ + Thresholds: []evictionapi.Threshold{ { - Signal: SignalMemoryAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("1Gi"), }, }, { - Signal: SignalMemoryAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("2Gi"), }, GracePeriod: time.Minute * 2, @@ -397,18 +398,18 @@ func TestDiskPressureNodeFs(t *testing.T) { config := Config{ MaxPodGracePeriodSeconds: 5, PressureTransitionPeriod: time.Minute * 5, - Thresholds: []Threshold{ + Thresholds: []evictionapi.Threshold{ { - Signal: SignalNodeFsAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("1Gi"), }, }, { - Signal: SignalNodeFsAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("2Gi"), }, GracePeriod: time.Minute * 2, @@ -594,14 +595,14 @@ func TestMinReclaim(t *testing.T) { config := Config{ MaxPodGracePeriodSeconds: 5, PressureTransitionPeriod: 
time.Minute * 5, - Thresholds: []Threshold{ + Thresholds: []evictionapi.Threshold{ { - Signal: SignalMemoryAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("1Gi"), }, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Quantity: quantityMustParse("500Mi"), }, }, @@ -733,14 +734,14 @@ func TestNodeReclaimFuncs(t *testing.T) { config := Config{ MaxPodGracePeriodSeconds: 5, PressureTransitionPeriod: time.Minute * 5, - Thresholds: []Threshold{ + Thresholds: []evictionapi.Threshold{ { - Signal: SignalNodeFsAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("1Gi"), }, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Quantity: quantityMustParse("500Mi"), }, }, @@ -925,18 +926,18 @@ func TestInodePressureNodeFsInodes(t *testing.T) { config := Config{ MaxPodGracePeriodSeconds: 5, PressureTransitionPeriod: time.Minute * 5, - Thresholds: []Threshold{ + Thresholds: []evictionapi.Threshold{ { - Signal: SignalNodeFsInodesFree, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("1Mi"), }, }, { - Signal: SignalNodeFsInodesFree, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("2Mi"), }, GracePeriod: time.Minute * 2, @@ -1127,18 +1128,18 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) { config := Config{ MaxPodGracePeriodSeconds: 5, PressureTransitionPeriod: time.Minute * 5, - Thresholds: []Threshold{ + Thresholds: []evictionapi.Threshold{ { - Signal: SignalMemoryAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("1Gi"), }, }, { - Signal: SignalMemoryAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("2Gi"), }, GracePeriod: time.Minute * 2, diff --git a/pkg/kubelet/eviction/helpers.go b/pkg/kubelet/eviction/helpers.go index 7d66878b44..26b4fbbc94 100644 --- a/pkg/kubelet/eviction/helpers.go +++ b/pkg/kubelet/eviction/helpers.go @@ -29,6 +29,7 @@ import ( "k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api/v1" statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats" + evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" "k8s.io/kubernetes/pkg/kubelet/qos" "k8s.io/kubernetes/pkg/kubelet/server/stats" "k8s.io/kubernetes/pkg/quota/evaluator/core" @@ -56,44 +57,44 @@ const ( var ( // signalToNodeCondition maps a signal to the node condition to report if threshold is met. - signalToNodeCondition map[Signal]v1.NodeConditionType + signalToNodeCondition map[evictionapi.Signal]v1.NodeConditionType // signalToResource maps a Signal to its associated Resource. 
- signalToResource map[Signal]v1.ResourceName + signalToResource map[evictionapi.Signal]v1.ResourceName // resourceToSignal maps a Resource to its associated Signal - resourceToSignal map[v1.ResourceName]Signal + resourceToSignal map[v1.ResourceName]evictionapi.Signal ) func init() { // map eviction signals to node conditions - signalToNodeCondition = map[Signal]v1.NodeConditionType{} - signalToNodeCondition[SignalMemoryAvailable] = v1.NodeMemoryPressure - signalToNodeCondition[SignalImageFsAvailable] = v1.NodeDiskPressure - signalToNodeCondition[SignalNodeFsAvailable] = v1.NodeDiskPressure - signalToNodeCondition[SignalImageFsInodesFree] = v1.NodeDiskPressure - signalToNodeCondition[SignalNodeFsInodesFree] = v1.NodeDiskPressure + signalToNodeCondition = map[evictionapi.Signal]v1.NodeConditionType{} + signalToNodeCondition[evictionapi.SignalMemoryAvailable] = v1.NodeMemoryPressure + signalToNodeCondition[evictionapi.SignalImageFsAvailable] = v1.NodeDiskPressure + signalToNodeCondition[evictionapi.SignalNodeFsAvailable] = v1.NodeDiskPressure + signalToNodeCondition[evictionapi.SignalImageFsInodesFree] = v1.NodeDiskPressure + signalToNodeCondition[evictionapi.SignalNodeFsInodesFree] = v1.NodeDiskPressure // map signals to resources (and vice-versa) - signalToResource = map[Signal]v1.ResourceName{} - signalToResource[SignalMemoryAvailable] = v1.ResourceMemory - signalToResource[SignalImageFsAvailable] = resourceImageFs - signalToResource[SignalImageFsInodesFree] = resourceImageFsInodes - signalToResource[SignalNodeFsAvailable] = resourceNodeFs - signalToResource[SignalNodeFsInodesFree] = resourceNodeFsInodes - resourceToSignal = map[v1.ResourceName]Signal{} + signalToResource = map[evictionapi.Signal]v1.ResourceName{} + signalToResource[evictionapi.SignalMemoryAvailable] = v1.ResourceMemory + signalToResource[evictionapi.SignalImageFsAvailable] = resourceImageFs + signalToResource[evictionapi.SignalImageFsInodesFree] = resourceImageFsInodes + signalToResource[evictionapi.SignalNodeFsAvailable] = resourceNodeFs + signalToResource[evictionapi.SignalNodeFsInodesFree] = resourceNodeFsInodes + resourceToSignal = map[v1.ResourceName]evictionapi.Signal{} for key, value := range signalToResource { resourceToSignal[value] = key } } // validSignal returns true if the signal is supported. -func validSignal(signal Signal) bool { +func validSignal(signal evictionapi.Signal) bool { _, found := signalToResource[signal] return found } // ParseThresholdConfig parses the flags for thresholds. -func ParseThresholdConfig(evictionHard, evictionSoft, evictionSoftGracePeriod, evictionMinimumReclaim string) ([]Threshold, error) { - results := []Threshold{} +func ParseThresholdConfig(evictionHard, evictionSoft, evictionSoftGracePeriod, evictionMinimumReclaim string) ([]evictionapi.Threshold, error) { + results := []evictionapi.Threshold{} hardThresholds, err := parseThresholdStatements(evictionHard) if err != nil { @@ -134,11 +135,11 @@ func ParseThresholdConfig(evictionHard, evictionSoft, evictionSoftGracePeriod, e } // parseThresholdStatements parses the input statements into a list of Threshold objects. 
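// For example, "memory.available<500Mi,nodefs.available<10%" yields one Threshold per
// comma-separated statement; repeating a signal within the same flag is rejected as a duplicate.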
-func parseThresholdStatements(expr string) ([]Threshold, error) { +func parseThresholdStatements(expr string) ([]evictionapi.Threshold, error) { if len(expr) == 0 { return nil, nil } - results := []Threshold{} + results := []evictionapi.Threshold{} statements := strings.Split(expr, ",") signalsFound := sets.NewString() for _, statement := range statements { @@ -156,12 +157,12 @@ func parseThresholdStatements(expr string) ([]Threshold, error) { } // parseThresholdStatement parses a threshold statement. -func parseThresholdStatement(statement string) (Threshold, error) { - tokens2Operator := map[string]ThresholdOperator{ - "<": OpLessThan, +func parseThresholdStatement(statement string) (evictionapi.Threshold, error) { + tokens2Operator := map[string]evictionapi.ThresholdOperator{ + "<": evictionapi.OpLessThan, } var ( - operator ThresholdOperator + operator evictionapi.ThresholdOperator parts []string ) for token := range tokens2Operator { @@ -173,41 +174,41 @@ func parseThresholdStatement(statement string) (Threshold, error) { } } if len(operator) == 0 || len(parts) != 2 { - return Threshold{}, fmt.Errorf("invalid eviction threshold syntax %v, expected <signal><operator><value>", statement) + return evictionapi.Threshold{}, fmt.Errorf("invalid eviction threshold syntax %v, expected <signal><operator><value>", statement) } - signal := Signal(parts[0]) + signal := evictionapi.Signal(parts[0]) if !validSignal(signal) { - return Threshold{}, fmt.Errorf(unsupportedEvictionSignal, signal) + return evictionapi.Threshold{}, fmt.Errorf(unsupportedEvictionSignal, signal) } quantityValue := parts[1] if strings.HasSuffix(quantityValue, "%") { percentage, err := parsePercentage(quantityValue) if err != nil { - return Threshold{}, err + return evictionapi.Threshold{}, err } if percentage <= 0 { - return Threshold{}, fmt.Errorf("eviction percentage threshold %v must be positive: %s", signal, quantityValue) + return evictionapi.Threshold{}, fmt.Errorf("eviction percentage threshold %v must be positive: %s", signal, quantityValue) } - return Threshold{ + return evictionapi.Threshold{ Signal: signal, Operator: operator, - Value: ThresholdValue{ + Value: evictionapi.ThresholdValue{ Percentage: percentage, }, }, nil } quantity, err := resource.ParseQuantity(quantityValue) if err != nil { - return Threshold{}, err + return evictionapi.Threshold{}, err } if quantity.Sign() < 0 || quantity.IsZero() { - return Threshold{}, fmt.Errorf("eviction threshold %v must be positive: %s", signal, &quantity) + return evictionapi.Threshold{}, fmt.Errorf("eviction threshold %v must be positive: %s", signal, &quantity) } - return Threshold{ + return evictionapi.Threshold{ Signal: signal, Operator: operator, - Value: ThresholdValue{ + Value: evictionapi.ThresholdValue{ Quantity: &quantity, }, }, nil @@ -223,18 +224,18 @@ func parsePercentage(input string) (float32, error) { } // parseGracePeriods parses the grace period statements -func parseGracePeriods(expr string) (map[Signal]time.Duration, error) { +func parseGracePeriods(expr string) (map[evictionapi.Signal]time.Duration, error) { if len(expr) == 0 { return nil, nil } - results := map[Signal]time.Duration{} + results := map[evictionapi.Signal]time.Duration{} statements := strings.Split(expr, ",") for _, statement := range statements { parts := strings.Split(statement, "=") if len(parts) != 2 { return nil, fmt.Errorf("invalid eviction grace period syntax %v, expected <signal>=<duration>", statement) } - signal := Signal(parts[0]) + signal := evictionapi.Signal(parts[0]) if !validSignal(signal) { return nil,
fmt.Errorf(unsupportedEvictionSignal, signal) } @@ -257,18 +258,18 @@ func parseGracePeriods(expr string) (map[Signal]time.Duration, error) { } // parseMinimumReclaims parses the minimum reclaim statements -func parseMinimumReclaims(expr string) (map[Signal]ThresholdValue, error) { +func parseMinimumReclaims(expr string) (map[evictionapi.Signal]evictionapi.ThresholdValue, error) { if len(expr) == 0 { return nil, nil } - results := map[Signal]ThresholdValue{} + results := map[evictionapi.Signal]evictionapi.ThresholdValue{} statements := strings.Split(expr, ",") for _, statement := range statements { parts := strings.Split(statement, "=") if len(parts) != 2 { return nil, fmt.Errorf("invalid eviction minimum reclaim syntax: %v, expected <signal>=<quantity>", statement) } - signal := Signal(parts[0]) + signal := evictionapi.Signal(parts[0]) if !validSignal(signal) { return nil, fmt.Errorf(unsupportedEvictionSignal, signal) } @@ -286,7 +287,7 @@ func parseMinimumReclaims(expr string) (map[Signal]ThresholdValue, error) { if _, found := results[signal]; found { return nil, fmt.Errorf("duplicate eviction minimum reclaim specified for %v", signal) } - results[signal] = ThresholdValue{ + results[signal] = evictionapi.ThresholdValue{ Percentage: percentage, } continue @@ -302,7 +303,7 @@ func parseMinimumReclaims(expr string) (map[Signal]ThresholdValue, error) { if err != nil { return nil, err } - results[signal] = ThresholdValue{ + results[signal] = evictionapi.ThresholdValue{ Quantity: &quantity, } } @@ -402,12 +403,12 @@ func podMemoryUsage(podStats statsapi.PodStats) (v1.ResourceList, error) { } // formatThreshold formats a threshold for logging. -func formatThreshold(threshold Threshold) string { - return fmt.Sprintf("threshold(signal=%v, operator=%v, value=%v, gracePeriod=%v)", threshold.Signal, formatThresholdValue(threshold.Value), threshold.Operator, threshold.GracePeriod) +func formatThreshold(threshold evictionapi.Threshold) string { + return fmt.Sprintf("threshold(signal=%v, operator=%v, value=%v, gracePeriod=%v)", threshold.Signal, threshold.Operator, formatThresholdValue(threshold.Value), threshold.GracePeriod) } -// formatThresholdValue formats a thresholdValue for logging. +// formatThresholdValue formats a ThresholdValue for logging.
+func formatThresholdValue(value evictionapi.ThresholdValue) string { if value.Quantity != nil { return value.Quantity.String() } @@ -622,7 +623,7 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObserv result := signalObservations{} if memory := summary.Node.Memory; memory != nil && memory.AvailableBytes != nil && memory.WorkingSetBytes != nil { - result[SignalMemoryAvailable] = signalObservation{ + result[evictionapi.SignalMemoryAvailable] = signalObservation{ available: resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI), capacity: resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI), time: memory.Time, @@ -630,14 +631,14 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObserv } if nodeFs := summary.Node.Fs; nodeFs != nil { if nodeFs.AvailableBytes != nil && nodeFs.CapacityBytes != nil { - result[SignalNodeFsAvailable] = signalObservation{ + result[evictionapi.SignalNodeFsAvailable] = signalObservation{ available: resource.NewQuantity(int64(*nodeFs.AvailableBytes), resource.BinarySI), capacity: resource.NewQuantity(int64(*nodeFs.CapacityBytes), resource.BinarySI), // TODO: add timestamp to stat (see memory stat) } } if nodeFs.InodesFree != nil && nodeFs.Inodes != nil { - result[SignalNodeFsInodesFree] = signalObservation{ + result[evictionapi.SignalNodeFsInodesFree] = signalObservation{ available: resource.NewQuantity(int64(*nodeFs.InodesFree), resource.BinarySI), capacity: resource.NewQuantity(int64(*nodeFs.Inodes), resource.BinarySI), // TODO: add timestamp to stat (see memory stat) @@ -647,13 +648,13 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObserv if summary.Node.Runtime != nil { if imageFs := summary.Node.Runtime.ImageFs; imageFs != nil { if imageFs.AvailableBytes != nil && imageFs.CapacityBytes != nil { - result[SignalImageFsAvailable] = signalObservation{ + result[evictionapi.SignalImageFsAvailable] = signalObservation{ available: resource.NewQuantity(int64(*imageFs.AvailableBytes), resource.BinarySI), capacity: resource.NewQuantity(int64(*imageFs.CapacityBytes), resource.BinarySI), // TODO: add timestamp to stat (see memory stat) } if imageFs.InodesFree != nil && imageFs.Inodes != nil { - result[SignalImageFsInodesFree] = signalObservation{ + result[evictionapi.SignalImageFsInodesFree] = signalObservation{ available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.BinarySI), capacity: resource.NewQuantity(int64(*imageFs.Inodes), resource.BinarySI), // TODO: add timestamp to stat (see memory stat) @@ -666,8 +667,8 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObserv } // thresholdsMet returns the set of thresholds that were met independent of grace period -func thresholdsMet(thresholds []Threshold, observations signalObservations, enforceMinReclaim bool) []Threshold { - results := []Threshold{} +func thresholdsMet(thresholds []evictionapi.Threshold, observations signalObservations, enforceMinReclaim bool) []evictionapi.Threshold { + results := []evictionapi.Threshold{} for i := range thresholds { threshold := thresholds[i] observed, found := observations[threshold.Signal] @@ -677,14 +678,14 @@ func thresholdsMet(thresholds []Threshold, observations signalObservations, enfo } // determine if we have met the specified threshold thresholdMet := false - quantity := getThresholdQuantity(threshold.Value, observed.capacity) + quantity := evictionapi.GetThresholdQuantity(threshold.Value, 
observed.capacity) // if enforceMinReclaim is specified, we compare relative to value - minreclaim if enforceMinReclaim && threshold.MinReclaim != nil { - quantity.Add(*getThresholdQuantity(*threshold.MinReclaim, observed.capacity)) + quantity.Add(*evictionapi.GetThresholdQuantity(*threshold.MinReclaim, observed.capacity)) } thresholdResult := quantity.Cmp(*observed.available) switch threshold.Operator { - case OpLessThan: + case evictionapi.OpLessThan: thresholdMet = thresholdResult > 0 } if thresholdMet { @@ -704,12 +705,12 @@ func debugLogObservations(logPrefix string, observations signalObservations) { } } -func debugLogThresholdsWithObservation(logPrefix string, thresholds []Threshold, observations signalObservations) { +func debugLogThresholdsWithObservation(logPrefix string, thresholds []evictionapi.Threshold, observations signalObservations) { for i := range thresholds { threshold := thresholds[i] observed, found := observations[threshold.Signal] if found { - quantity := getThresholdQuantity(threshold.Value, observed.capacity) + quantity := evictionapi.GetThresholdQuantity(threshold.Value, observed.capacity) glog.V(3).Infof("eviction manager: %v: threshold [signal=%v, quantity=%v] observed %v", logPrefix, threshold.Signal, quantity, observed.available) } else { glog.V(3).Infof("eviction manager: %v: threshold [signal=%v] had no observation", logPrefix, threshold.Signal) @@ -717,8 +718,8 @@ func debugLogThresholdsWithObservation(logPrefix string, thresholds []Threshold, } } -func thresholdsUpdatedStats(thresholds []Threshold, observations, lastObservations signalObservations) []Threshold { - results := []Threshold{} +func thresholdsUpdatedStats(thresholds []evictionapi.Threshold, observations, lastObservations signalObservations) []evictionapi.Threshold { + results := []evictionapi.Threshold{} for i := range thresholds { threshold := thresholds[i] observed, found := observations[threshold.Signal] @@ -734,16 +735,8 @@ func thresholdsUpdatedStats(thresholds []Threshold, observations, lastObservatio return results } -// getThresholdQuantity returns the expected quantity value for a thresholdValue -func getThresholdQuantity(value ThresholdValue, capacity *resource.Quantity) *resource.Quantity { - if value.Quantity != nil { - return value.Quantity.Copy() - } - return resource.NewQuantity(int64(float64(capacity.Value())*float64(value.Percentage)), resource.BinarySI) -} - // thresholdsFirstObservedAt merges the input set of thresholds with the previous observation to determine when active set of thresholds were initially met. 
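// A threshold found in lastObservedAt keeps its original timestamp, so a soft threshold's
// grace period is measured from the first breach rather than from the most recent sync.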
-func thresholdsFirstObservedAt(thresholds []Threshold, lastObservedAt thresholdsObservedAt, now time.Time) thresholdsObservedAt { +func thresholdsFirstObservedAt(thresholds []evictionapi.Threshold, lastObservedAt thresholdsObservedAt, now time.Time) thresholdsObservedAt { results := thresholdsObservedAt{} for i := range thresholds { observedAt, found := lastObservedAt[thresholds[i]] @@ -756,8 +749,8 @@ } // thresholdsMetGracePeriod returns the set of thresholds that have satisfied associated grace period -func thresholdsMetGracePeriod(observedAt thresholdsObservedAt, now time.Time) []Threshold { - results := []Threshold{} +func thresholdsMetGracePeriod(observedAt thresholdsObservedAt, now time.Time) []evictionapi.Threshold { + results := []evictionapi.Threshold{} for threshold, at := range observedAt { duration := now.Sub(at) if duration < threshold.GracePeriod { @@ -770,7 +763,7 @@ func thresholdsMetGracePeriod(observedAt thresholdsObservedAt, now time.Time) [] } // nodeConditions returns the set of node conditions associated with a threshold -func nodeConditions(thresholds []Threshold) []v1.NodeConditionType { +func nodeConditions(thresholds []evictionapi.Threshold) []v1.NodeConditionType { results := []v1.NodeConditionType{} for _, threshold := range thresholds { if nodeCondition, found := signalToNodeCondition[threshold.Signal]; found { @@ -832,7 +825,7 @@ func hasNodeCondition(inputs []v1.NodeConditionType, item v1.NodeConditionType) } // mergeThresholds will merge both threshold lists eliminating duplicates. -func mergeThresholds(inputsA []Threshold, inputsB []Threshold) []Threshold { +func mergeThresholds(inputsA []evictionapi.Threshold, inputsB []evictionapi.Threshold) []evictionapi.Threshold { results := inputsA for _, threshold := range inputsB { if !hasThreshold(results, threshold) { @@ -843,7 +836,7 @@ } // hasThreshold returns true if the threshold is in the input list -func hasThreshold(inputs []Threshold, item Threshold) bool { +func hasThreshold(inputs []evictionapi.Threshold, item evictionapi.Threshold) bool { for _, input := range inputs { if input.GracePeriod == item.GracePeriod && input.Operator == item.Operator && input.Signal == item.Signal && compareThresholdValue(input.Value, item.Value) { return true @@ -852,8 +845,8 @@ return false } -// compareThresholdValue returns true if the two thresholdValue objects are logically the same -func compareThresholdValue(a ThresholdValue, b ThresholdValue) bool { +// compareThresholdValue returns true if the two ThresholdValue objects are logically the same +func compareThresholdValue(a evictionapi.ThresholdValue, b evictionapi.ThresholdValue) bool { if a.Quantity != nil { if b.Quantity == nil { return false @@ -867,7 +860,7 @@ func compareThresholdValue(a ThresholdValue, b ThresholdValue) bool { } // getStarvedResources returns the set of resources that are starved based on thresholds met. 
-func getStarvedResources(thresholds []Threshold) []v1.ResourceName { +func getStarvedResources(thresholds []evictionapi.Threshold) []v1.ResourceName { results := []v1.ResourceName{} for _, threshold := range thresholds { if starvedResource, found := signalToResource[threshold.Signal]; found { @@ -878,7 +871,7 @@ func getStarvedResources(thresholds []Threshold) []v1.ResourceName { } // isSoftEviction returns true if the thresholds met for the starved resource are only soft thresholds -func isSoftEvictionThresholds(thresholds []Threshold, starvedResource v1.ResourceName) bool { +func isSoftEvictionThresholds(thresholds []evictionapi.Threshold, starvedResource v1.ResourceName) bool { for _, threshold := range thresholds { if resourceToCheck := signalToResource[threshold.Signal]; resourceToCheck != starvedResource { continue @@ -891,7 +884,7 @@ func isSoftEvictionThresholds(thresholds []Threshold, starvedResource v1.Resourc } // isSoftEviction returns true if the thresholds met for the starved resource are only soft thresholds -func isHardEvictionThreshold(threshold Threshold) bool { +func isHardEvictionThreshold(threshold evictionapi.Threshold) bool { return threshold.GracePeriod == time.Duration(0) } diff --git a/pkg/kubelet/eviction/helpers_test.go b/pkg/kubelet/eviction/helpers_test.go index 6b082d43a4..67d2c16ffe 100644 --- a/pkg/kubelet/eviction/helpers_test.go +++ b/pkg/kubelet/eviction/helpers_test.go @@ -28,6 +28,7 @@ import ( "k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api/v1" statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats" + evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" "k8s.io/kubernetes/pkg/quota" ) @@ -44,7 +45,7 @@ func TestParseThresholdConfig(t *testing.T) { evictionSoftGracePeriod string evictionMinReclaim string expectErr bool - expectThresholds []Threshold + expectThresholds []evictionapi.Threshold }{ "no values": { evictionHard: "", @@ -52,7 +53,7 @@ func TestParseThresholdConfig(t *testing.T) { evictionSoftGracePeriod: "", evictionMinReclaim: "", expectErr: false, - expectThresholds: []Threshold{}, + expectThresholds: []evictionapi.Threshold{}, }, "all flag values": { evictionHard: "memory.available<150Mi", @@ -60,25 +61,25 @@ func TestParseThresholdConfig(t *testing.T) { evictionSoftGracePeriod: "memory.available=30s", evictionMinReclaim: "memory.available=0", expectErr: false, - expectThresholds: []Threshold{ + expectThresholds: []evictionapi.Threshold{ { - Signal: SignalMemoryAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("150Mi"), }, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Quantity: quantityMustParse("0"), }, }, { - Signal: SignalMemoryAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("300Mi"), }, GracePeriod: gracePeriod, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Quantity: quantityMustParse("0"), }, }, @@ -90,25 +91,25 @@ func TestParseThresholdConfig(t *testing.T) { evictionSoftGracePeriod: "memory.available=30s", evictionMinReclaim: "memory.available=5%", expectErr: false, - expectThresholds: []Threshold{ + expectThresholds: []evictionapi.Threshold{ { - Signal: SignalMemoryAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: 
evictionapi.SignalMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Percentage: 0.1, }, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Percentage: 0.05, }, }, { - Signal: SignalMemoryAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Percentage: 0.3, }, GracePeriod: gracePeriod, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Percentage: 0.05, }, }, @@ -120,46 +121,46 @@ func TestParseThresholdConfig(t *testing.T) { evictionSoftGracePeriod: "imagefs.available=30s,nodefs.available=30s", evictionMinReclaim: "imagefs.available=2Gi,nodefs.available=1Gi", expectErr: false, - expectThresholds: []Threshold{ + expectThresholds: []evictionapi.Threshold{ { - Signal: SignalImageFsAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("150Mi"), }, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Quantity: quantityMustParse("2Gi"), }, }, { - Signal: SignalNodeFsAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("100Mi"), }, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Quantity: quantityMustParse("1Gi"), }, }, { - Signal: SignalImageFsAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("300Mi"), }, GracePeriod: gracePeriod, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Quantity: quantityMustParse("2Gi"), }, }, { - Signal: SignalNodeFsAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("200Mi"), }, GracePeriod: gracePeriod, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Quantity: quantityMustParse("1Gi"), }, }, @@ -171,46 +172,46 @@ func TestParseThresholdConfig(t *testing.T) { evictionSoftGracePeriod: "imagefs.available=30s,nodefs.available=30s", evictionMinReclaim: "imagefs.available=10%,nodefs.available=5%", expectErr: false, - expectThresholds: []Threshold{ + expectThresholds: []evictionapi.Threshold{ { - Signal: SignalImageFsAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Percentage: 0.15, }, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Percentage: 0.1, }, }, { - Signal: SignalNodeFsAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Percentage: 0.105, }, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Percentage: 0.05, }, }, { - Signal: SignalImageFsAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalImageFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Percentage: 0.3, }, GracePeriod: 
gracePeriod, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Percentage: 0.1, }, }, { - Signal: SignalNodeFsAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Percentage: 0.205, }, GracePeriod: gracePeriod, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Percentage: 0.05, }, }, @@ -222,46 +223,46 @@ func TestParseThresholdConfig(t *testing.T) { evictionSoftGracePeriod: "imagefs.inodesFree=30s,nodefs.inodesFree=30s", evictionMinReclaim: "imagefs.inodesFree=2Gi,nodefs.inodesFree=1Gi", expectErr: false, - expectThresholds: []Threshold{ + expectThresholds: []evictionapi.Threshold{ { - Signal: SignalImageFsInodesFree, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalImageFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("150Mi"), }, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Quantity: quantityMustParse("2Gi"), }, }, { - Signal: SignalNodeFsInodesFree, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("100Mi"), }, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Quantity: quantityMustParse("1Gi"), }, }, { - Signal: SignalImageFsInodesFree, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalImageFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("300Mi"), }, GracePeriod: gracePeriod, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Quantity: quantityMustParse("2Gi"), }, }, { - Signal: SignalNodeFsInodesFree, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalNodeFsInodesFree, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("200Mi"), }, GracePeriod: gracePeriod, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Quantity: quantityMustParse("1Gi"), }, }, @@ -273,7 +274,7 @@ func TestParseThresholdConfig(t *testing.T) { evictionSoftGracePeriod: "", evictionMinReclaim: "", expectErr: true, - expectThresholds: []Threshold{}, + expectThresholds: []evictionapi.Threshold{}, }, "hard-signal-negative": { evictionHard: "memory.available<-150Mi", @@ -281,7 +282,7 @@ func TestParseThresholdConfig(t *testing.T) { evictionSoftGracePeriod: "", evictionMinReclaim: "", expectErr: true, - expectThresholds: []Threshold{}, + expectThresholds: []evictionapi.Threshold{}, }, "hard-signal-negative-percentage": { evictionHard: "memory.available<-15%", @@ -289,7 +290,7 @@ func TestParseThresholdConfig(t *testing.T) { evictionSoftGracePeriod: "", evictionMinReclaim: "", expectErr: true, - expectThresholds: []Threshold{}, + expectThresholds: []evictionapi.Threshold{}, }, "soft-signal-negative": { evictionHard: "", @@ -297,7 +298,7 @@ func TestParseThresholdConfig(t *testing.T) { evictionSoftGracePeriod: "", evictionMinReclaim: "", expectErr: true, - expectThresholds: []Threshold{}, + expectThresholds: []evictionapi.Threshold{}, }, "duplicate-signal": { evictionHard: "memory.available<150Mi,memory.available<100Mi", @@ -305,7 +306,7 @@ func TestParseThresholdConfig(t *testing.T) { evictionSoftGracePeriod: "", evictionMinReclaim: "", expectErr: true, 
- expectThresholds: []Threshold{}, + expectThresholds: []evictionapi.Threshold{}, }, "valid-and-invalid-signal": { evictionHard: "memory.available<150Mi,invalid.foo<150Mi", @@ -313,7 +314,7 @@ func TestParseThresholdConfig(t *testing.T) { evictionSoftGracePeriod: "", evictionMinReclaim: "", expectErr: true, - expectThresholds: []Threshold{}, + expectThresholds: []evictionapi.Threshold{}, }, "soft-no-grace-period": { evictionHard: "", @@ -321,7 +322,7 @@ func TestParseThresholdConfig(t *testing.T) { evictionSoftGracePeriod: "", evictionMinReclaim: "", expectErr: true, - expectThresholds: []Threshold{}, + expectThresholds: []evictionapi.Threshold{}, }, "soft-neg-grace-period": { evictionHard: "", @@ -329,7 +330,7 @@ func TestParseThresholdConfig(t *testing.T) { evictionSoftGracePeriod: "memory.available=-30s", evictionMinReclaim: "", expectErr: true, - expectThresholds: []Threshold{}, + expectThresholds: []evictionapi.Threshold{}, }, "neg-reclaim": { evictionHard: "", @@ -337,7 +338,7 @@ func TestParseThresholdConfig(t *testing.T) { evictionSoftGracePeriod: "", evictionMinReclaim: "memory.available=-300Mi", expectErr: true, - expectThresholds: []Threshold{}, + expectThresholds: []evictionapi.Threshold{}, }, "duplicate-reclaim": { evictionHard: "", @@ -345,7 +346,7 @@ func TestParseThresholdConfig(t *testing.T) { evictionSoftGracePeriod: "", evictionMinReclaim: "memory.available=-300Mi,memory.available=-100Mi", expectErr: true, - expectThresholds: []Threshold{}, + expectThresholds: []evictionapi.Threshold{}, }, } for testName, testCase := range testCases { @@ -359,7 +360,7 @@ func TestParseThresholdConfig(t *testing.T) { } } -func thresholdsEqual(expected []Threshold, actual []Threshold) bool { +func thresholdsEqual(expected []evictionapi.Threshold, actual []evictionapi.Threshold) bool { if len(expected) != len(actual) { return false } @@ -388,7 +389,7 @@ func thresholdsEqual(expected []Threshold, actual []Threshold) bool { return true } -func thresholdEqual(a Threshold, b Threshold) bool { +func thresholdEqual(a evictionapi.Threshold, b evictionapi.Threshold) bool { return a.GracePeriod == b.GracePeriod && a.Operator == b.Operator && a.Signal == b.Signal && @@ -746,7 +747,7 @@ func TestMakeSignalObservations(t *testing.T) { if err != nil { t.Errorf("Unexpected err: %v", err) } - memQuantity, found := actualObservations[SignalMemoryAvailable] + memQuantity, found := actualObservations[evictionapi.SignalMemoryAvailable] if !found { t.Errorf("Expected available memory observation: %v", err) } @@ -756,7 +757,7 @@ func TestMakeSignalObservations(t *testing.T) { if expectedBytes := int64(nodeWorkingSetBytes + nodeAvailableBytes); memQuantity.capacity.Value() != expectedBytes { t.Errorf("Expected %v, actual: %v", expectedBytes, memQuantity.capacity.Value()) } - nodeFsQuantity, found := actualObservations[SignalNodeFsAvailable] + nodeFsQuantity, found := actualObservations[evictionapi.SignalNodeFsAvailable] if !found { t.Errorf("Expected available nodefs observation: %v", err) } @@ -766,7 +767,7 @@ func TestMakeSignalObservations(t *testing.T) { if expectedBytes := int64(nodeFsCapacityBytes); nodeFsQuantity.capacity.Value() != expectedBytes { t.Errorf("Expected %v, actual: %v", expectedBytes, nodeFsQuantity.capacity.Value()) } - nodeFsInodesQuantity, found := actualObservations[SignalNodeFsInodesFree] + nodeFsInodesQuantity, found := actualObservations[evictionapi.SignalNodeFsInodesFree] if !found { t.Errorf("Expected inodes free nodefs observation: %v", err) } @@ -776,7 +777,7 @@ func 
TestMakeSignalObservations(t *testing.T) { if expected := int64(nodeFsInodes); nodeFsInodesQuantity.capacity.Value() != expected { t.Errorf("Expected %v, actual: %v", expected, nodeFsInodesQuantity.capacity.Value()) } - imageFsQuantity, found := actualObservations[SignalImageFsAvailable] + imageFsQuantity, found := actualObservations[evictionapi.SignalImageFsAvailable] if !found { t.Errorf("Expected available imagefs observation: %v", err) } @@ -786,7 +787,7 @@ func TestMakeSignalObservations(t *testing.T) { if expectedBytes := int64(imageFsCapacityBytes); imageFsQuantity.capacity.Value() != expectedBytes { t.Errorf("Expected %v, actual: %v", expectedBytes, imageFsQuantity.capacity.Value()) } - imageFsInodesQuantity, found := actualObservations[SignalImageFsInodesFree] + imageFsInodesQuantity, found := actualObservations[evictionapi.SignalImageFsInodesFree] if !found { t.Errorf("Expected inodes free imagefs observation: %v", err) } @@ -811,67 +812,67 @@ func TestMakeSignalObservations(t *testing.T) { } func TestThresholdsMet(t *testing.T) { - hardThreshold := Threshold{ - Signal: SignalMemoryAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + hardThreshold := evictionapi.Threshold{ + Signal: evictionapi.SignalMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("1Gi"), }, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Quantity: quantityMustParse("500Mi"), }, } testCases := map[string]struct { enforceMinReclaim bool - thresholds []Threshold + thresholds []evictionapi.Threshold observations signalObservations - result []Threshold + result []evictionapi.Threshold }{ "empty": { enforceMinReclaim: false, - thresholds: []Threshold{}, + thresholds: []evictionapi.Threshold{}, observations: signalObservations{}, - result: []Threshold{}, + result: []evictionapi.Threshold{}, }, "threshold-met-memory": { enforceMinReclaim: false, - thresholds: []Threshold{hardThreshold}, + thresholds: []evictionapi.Threshold{hardThreshold}, observations: signalObservations{ - SignalMemoryAvailable: signalObservation{ + evictionapi.SignalMemoryAvailable: signalObservation{ available: quantityMustParse("500Mi"), }, }, - result: []Threshold{hardThreshold}, + result: []evictionapi.Threshold{hardThreshold}, }, "threshold-not-met": { enforceMinReclaim: false, - thresholds: []Threshold{hardThreshold}, + thresholds: []evictionapi.Threshold{hardThreshold}, observations: signalObservations{ - SignalMemoryAvailable: signalObservation{ + evictionapi.SignalMemoryAvailable: signalObservation{ available: quantityMustParse("2Gi"), }, }, - result: []Threshold{}, + result: []evictionapi.Threshold{}, }, "threshold-met-with-min-reclaim": { enforceMinReclaim: true, - thresholds: []Threshold{hardThreshold}, + thresholds: []evictionapi.Threshold{hardThreshold}, observations: signalObservations{ - SignalMemoryAvailable: signalObservation{ + evictionapi.SignalMemoryAvailable: signalObservation{ available: quantityMustParse("1.05Gi"), }, }, - result: []Threshold{hardThreshold}, + result: []evictionapi.Threshold{hardThreshold}, }, "threshold-not-met-with-min-reclaim": { enforceMinReclaim: true, - thresholds: []Threshold{hardThreshold}, + thresholds: []evictionapi.Threshold{hardThreshold}, observations: signalObservations{ - SignalMemoryAvailable: signalObservation{ + evictionapi.SignalMemoryAvailable: signalObservation{ available: quantityMustParse("2Gi"), }, }, - result: []Threshold{}, + result: []evictionapi.Threshold{}, }, } for testName, 
testCase := range testCases { @@ -883,8 +884,8 @@ func TestThresholdsMet(t *testing.T) { } func TestThresholdsUpdatedStats(t *testing.T) { - updatedThreshold := Threshold{ - Signal: SignalMemoryAvailable, + updatedThreshold := evictionapi.Threshold{ + Signal: evictionapi.SignalMemoryAvailable, } locationUTC, err := time.LoadLocation("UTC") if err != nil { @@ -892,76 +893,76 @@ func TestThresholdsUpdatedStats(t *testing.T) { return } testCases := map[string]struct { - thresholds []Threshold + thresholds []evictionapi.Threshold observations signalObservations last signalObservations - result []Threshold + result []evictionapi.Threshold }{ "empty": { - thresholds: []Threshold{}, + thresholds: []evictionapi.Threshold{}, observations: signalObservations{}, last: signalObservations{}, - result: []Threshold{}, + result: []evictionapi.Threshold{}, }, "no-time": { - thresholds: []Threshold{updatedThreshold}, + thresholds: []evictionapi.Threshold{updatedThreshold}, observations: signalObservations{ - SignalMemoryAvailable: signalObservation{}, + evictionapi.SignalMemoryAvailable: signalObservation{}, }, last: signalObservations{}, - result: []Threshold{updatedThreshold}, + result: []evictionapi.Threshold{updatedThreshold}, }, "no-last-observation": { - thresholds: []Threshold{updatedThreshold}, + thresholds: []evictionapi.Threshold{updatedThreshold}, observations: signalObservations{ - SignalMemoryAvailable: signalObservation{ + evictionapi.SignalMemoryAvailable: signalObservation{ time: metav1.Date(2016, 1, 1, 0, 0, 0, 0, locationUTC), }, }, last: signalObservations{}, - result: []Threshold{updatedThreshold}, + result: []evictionapi.Threshold{updatedThreshold}, }, "time-machine": { - thresholds: []Threshold{updatedThreshold}, + thresholds: []evictionapi.Threshold{updatedThreshold}, observations: signalObservations{ - SignalMemoryAvailable: signalObservation{ + evictionapi.SignalMemoryAvailable: signalObservation{ time: metav1.Date(2016, 1, 1, 0, 0, 0, 0, locationUTC), }, }, last: signalObservations{ - SignalMemoryAvailable: signalObservation{ + evictionapi.SignalMemoryAvailable: signalObservation{ time: metav1.Date(2016, 1, 1, 0, 1, 0, 0, locationUTC), }, }, - result: []Threshold{}, + result: []evictionapi.Threshold{}, }, "same-observation": { - thresholds: []Threshold{updatedThreshold}, + thresholds: []evictionapi.Threshold{updatedThreshold}, observations: signalObservations{ - SignalMemoryAvailable: signalObservation{ + evictionapi.SignalMemoryAvailable: signalObservation{ time: metav1.Date(2016, 1, 1, 0, 0, 0, 0, locationUTC), }, }, last: signalObservations{ - SignalMemoryAvailable: signalObservation{ + evictionapi.SignalMemoryAvailable: signalObservation{ time: metav1.Date(2016, 1, 1, 0, 0, 0, 0, locationUTC), }, }, - result: []Threshold{}, + result: []evictionapi.Threshold{}, }, "new-observation": { - thresholds: []Threshold{updatedThreshold}, + thresholds: []evictionapi.Threshold{updatedThreshold}, observations: signalObservations{ - SignalMemoryAvailable: signalObservation{ + evictionapi.SignalMemoryAvailable: signalObservation{ time: metav1.Date(2016, 1, 1, 0, 1, 0, 0, locationUTC), }, }, last: signalObservations{ - SignalMemoryAvailable: signalObservation{ + evictionapi.SignalMemoryAvailable: signalObservation{ time: metav1.Date(2016, 1, 1, 0, 0, 0, 0, locationUTC), }, }, - result: []Threshold{updatedThreshold}, + result: []evictionapi.Threshold{updatedThreshold}, }, } for testName, testCase := range testCases { @@ -973,21 +974,21 @@ func TestThresholdsUpdatedStats(t *testing.T) { } func 
TestPercentageThresholdsMet(t *testing.T) { - specificThresholds := []Threshold{ + specificThresholds := []evictionapi.Threshold{ { - Signal: SignalMemoryAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Percentage: 0.2, }, - MinReclaim: &ThresholdValue{ + MinReclaim: &evictionapi.ThresholdValue{ Percentage: 0.05, }, }, { - Signal: SignalNodeFsAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + Signal: evictionapi.SignalNodeFsAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Percentage: 0.3, }, }, @@ -995,19 +996,19 @@ func TestPercentageThresholdsMet(t *testing.T) { testCases := map[string]struct { enforceMinRelaim bool - thresholds []Threshold + thresholds []evictionapi.Threshold observations signalObservations - result []Threshold + result []evictionapi.Threshold }{ "BothMet": { enforceMinRelaim: false, thresholds: specificThresholds, observations: signalObservations{ - SignalMemoryAvailable: signalObservation{ + evictionapi.SignalMemoryAvailable: signalObservation{ available: quantityMustParse("100Mi"), capacity: quantityMustParse("1000Mi"), }, - SignalNodeFsAvailable: signalObservation{ + evictionapi.SignalNodeFsAvailable: signalObservation{ available: quantityMustParse("100Gi"), capacity: quantityMustParse("1000Gi"), }, @@ -1018,68 +1019,68 @@ func TestPercentageThresholdsMet(t *testing.T) { enforceMinRelaim: false, thresholds: specificThresholds, observations: signalObservations{ - SignalMemoryAvailable: signalObservation{ + evictionapi.SignalMemoryAvailable: signalObservation{ available: quantityMustParse("300Mi"), capacity: quantityMustParse("1000Mi"), }, - SignalNodeFsAvailable: signalObservation{ + evictionapi.SignalNodeFsAvailable: signalObservation{ available: quantityMustParse("400Gi"), capacity: quantityMustParse("1000Gi"), }, }, - result: []Threshold{}, + result: []evictionapi.Threshold{}, }, "DiskMet": { enforceMinRelaim: false, thresholds: specificThresholds, observations: signalObservations{ - SignalMemoryAvailable: signalObservation{ + evictionapi.SignalMemoryAvailable: signalObservation{ available: quantityMustParse("300Mi"), capacity: quantityMustParse("1000Mi"), }, - SignalNodeFsAvailable: signalObservation{ + evictionapi.SignalNodeFsAvailable: signalObservation{ available: quantityMustParse("100Gi"), capacity: quantityMustParse("1000Gi"), }, }, - result: []Threshold{specificThresholds[1]}, + result: []evictionapi.Threshold{specificThresholds[1]}, }, "MemoryMet": { enforceMinRelaim: false, thresholds: specificThresholds, observations: signalObservations{ - SignalMemoryAvailable: signalObservation{ + evictionapi.SignalMemoryAvailable: signalObservation{ available: quantityMustParse("100Mi"), capacity: quantityMustParse("1000Mi"), }, - SignalNodeFsAvailable: signalObservation{ + evictionapi.SignalNodeFsAvailable: signalObservation{ available: quantityMustParse("400Gi"), capacity: quantityMustParse("1000Gi"), }, }, - result: []Threshold{specificThresholds[0]}, + result: []evictionapi.Threshold{specificThresholds[0]}, }, "MemoryMetWithMinReclaim": { enforceMinRelaim: true, thresholds: specificThresholds, observations: signalObservations{ - SignalMemoryAvailable: signalObservation{ + evictionapi.SignalMemoryAvailable: signalObservation{ available: quantityMustParse("225Mi"), capacity: quantityMustParse("1000Mi"), }, }, - result: []Threshold{specificThresholds[0]}, + result: 
[]evictionapi.Threshold{specificThresholds[0]}, }, "MemoryNotMetWithMinReclaim": { enforceMinRelaim: true, thresholds: specificThresholds, observations: signalObservations{ - SignalMemoryAvailable: signalObservation{ + evictionapi.SignalMemoryAvailable: signalObservation{ available: quantityMustParse("300Mi"), capacity: quantityMustParse("1000Mi"), }, }, - result: []Threshold{}, + result: []evictionapi.Threshold{}, }, } for testName, testCase := range testCases { @@ -1091,29 +1092,29 @@ func TestPercentageThresholdsMet(t *testing.T) { } func TestThresholdsFirstObservedAt(t *testing.T) { - hardThreshold := Threshold{ - Signal: SignalMemoryAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + hardThreshold := evictionapi.Threshold{ + Signal: evictionapi.SignalMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("1Gi"), }, } now := metav1.Now() oldTime := metav1.NewTime(now.Time.Add(-1 * time.Minute)) testCases := map[string]struct { - thresholds []Threshold + thresholds []evictionapi.Threshold lastObservedAt thresholdsObservedAt now time.Time result thresholdsObservedAt }{ "empty": { - thresholds: []Threshold{}, + thresholds: []evictionapi.Threshold{}, lastObservedAt: thresholdsObservedAt{}, now: now.Time, result: thresholdsObservedAt{}, }, "no-previous-observation": { - thresholds: []Threshold{hardThreshold}, + thresholds: []evictionapi.Threshold{hardThreshold}, lastObservedAt: thresholdsObservedAt{}, now: now.Time, result: thresholdsObservedAt{ @@ -1121,7 +1122,7 @@ func TestThresholdsFirstObservedAt(t *testing.T) { }, }, "previous-observation": { - thresholds: []Threshold{hardThreshold}, + thresholds: []evictionapi.Threshold{hardThreshold}, lastObservedAt: thresholdsObservedAt{ hardThreshold: oldTime.Time, }, @@ -1141,17 +1142,17 @@ func TestThresholdsFirstObservedAt(t *testing.T) { func TestThresholdsMetGracePeriod(t *testing.T) { now := metav1.Now() - hardThreshold := Threshold{ - Signal: SignalMemoryAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + hardThreshold := evictionapi.Threshold{ + Signal: evictionapi.SignalMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("1Gi"), }, } - softThreshold := Threshold{ - Signal: SignalMemoryAvailable, - Operator: OpLessThan, - Value: ThresholdValue{ + softThreshold := evictionapi.Threshold{ + Signal: evictionapi.SignalMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ Quantity: quantityMustParse("2Gi"), }, GracePeriod: 1 * time.Minute, @@ -1160,33 +1161,33 @@ func TestThresholdsMetGracePeriod(t *testing.T) { testCases := map[string]struct { observedAt thresholdsObservedAt now time.Time - result []Threshold + result []evictionapi.Threshold }{ "empty": { observedAt: thresholdsObservedAt{}, now: now.Time, - result: []Threshold{}, + result: []evictionapi.Threshold{}, }, "hard-threshold-met": { observedAt: thresholdsObservedAt{ hardThreshold: now.Time, }, now: now.Time, - result: []Threshold{hardThreshold}, + result: []evictionapi.Threshold{hardThreshold}, }, "soft-threshold-not-met": { observedAt: thresholdsObservedAt{ softThreshold: now.Time, }, now: now.Time, - result: []Threshold{}, + result: []evictionapi.Threshold{}, }, "soft-threshold-met": { observedAt: thresholdsObservedAt{ softThreshold: oldTime.Time, }, now: now.Time, - result: []Threshold{softThreshold}, + result: []evictionapi.Threshold{softThreshold}, }, } for testName, testCase := range testCases { 
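Note: the grace-period cases above pin down the rule that a threshold is only reported once it has been continuously observed for at least its GracePeriod; hard thresholds carry a zero GracePeriod and fire immediately. A minimal sketch of that filter, assuming the thresholdsObservedAt map defined in types.go (simplified from the helper these tests exercise, with logging omitted):

    // thresholdsMetGracePeriod returns the thresholds whose grace period has
    // elapsed as of now; a zero GracePeriod (hard threshold) always passes.
    func thresholdsMetGracePeriod(observedAt thresholdsObservedAt, now time.Time) []evictionapi.Threshold {
    	results := []evictionapi.Threshold{}
    	for threshold, at := range observedAt {
    		if now.Sub(at) < threshold.GracePeriod {
    			continue
    		}
    		results = append(results, threshold)
    	}
    	return results
    }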
@@ -1199,16 +1200,16 @@ func TestThresholdsMetGracePeriod(t *testing.T) { func TestNodeConditions(t *testing.T) { testCases := map[string]struct { - inputs []Threshold + inputs []evictionapi.Threshold result []v1.NodeConditionType }{ "empty-list": { - inputs: []Threshold{}, + inputs: []evictionapi.Threshold{}, result: []v1.NodeConditionType{}, }, "memory.available": { - inputs: []Threshold{ - {Signal: SignalMemoryAvailable}, + inputs: []evictionapi.Threshold{ + {Signal: evictionapi.SignalMemoryAvailable}, }, result: []v1.NodeConditionType{v1.NodeMemoryPressure}, }, @@ -1327,24 +1328,24 @@ func TestHasNodeConditions(t *testing.T) { func TestGetStarvedResources(t *testing.T) { testCases := map[string]struct { - inputs []Threshold + inputs []evictionapi.Threshold result []v1.ResourceName }{ "memory.available": { - inputs: []Threshold{ - {Signal: SignalMemoryAvailable}, + inputs: []evictionapi.Threshold{ + {Signal: evictionapi.SignalMemoryAvailable}, }, result: []v1.ResourceName{v1.ResourceMemory}, }, "imagefs.available": { - inputs: []Threshold{ - {Signal: SignalImageFsAvailable}, + inputs: []evictionapi.Threshold{ + {Signal: evictionapi.SignalImageFsAvailable}, }, result: []v1.ResourceName{resourceImageFs}, }, "nodefs.available": { - inputs: []Threshold{ - {Signal: SignalNodeFsAvailable}, + inputs: []evictionapi.Threshold{ + {Signal: evictionapi.SignalNodeFsAvailable}, }, result: []v1.ResourceName{resourceNodeFs}, }, @@ -1397,50 +1398,50 @@ func testParsePercentage(t *testing.T) { func testCompareThresholdValue(t *testing.T) { testCases := []struct { - a, b ThresholdValue + a, b evictionapi.ThresholdValue equal bool }{ { - a: ThresholdValue{ + a: evictionapi.ThresholdValue{ Quantity: resource.NewQuantity(123, resource.BinarySI), }, - b: ThresholdValue{ + b: evictionapi.ThresholdValue{ Quantity: resource.NewQuantity(123, resource.BinarySI), }, equal: true, }, { - a: ThresholdValue{ + a: evictionapi.ThresholdValue{ Quantity: resource.NewQuantity(123, resource.BinarySI), }, - b: ThresholdValue{ + b: evictionapi.ThresholdValue{ Quantity: resource.NewQuantity(456, resource.BinarySI), }, equal: false, }, { - a: ThresholdValue{ + a: evictionapi.ThresholdValue{ Quantity: resource.NewQuantity(123, resource.BinarySI), }, - b: ThresholdValue{ + b: evictionapi.ThresholdValue{ Percentage: 0.1, }, equal: false, }, { - a: ThresholdValue{ + a: evictionapi.ThresholdValue{ Percentage: 0.1, }, - b: ThresholdValue{ + b: evictionapi.ThresholdValue{ Percentage: 0.1, }, equal: true, }, { - a: ThresholdValue{ + a: evictionapi.ThresholdValue{ Percentage: 0.2, }, - b: ThresholdValue{ + b: evictionapi.ThresholdValue{ Percentage: 0.1, }, equal: false, @@ -1601,7 +1602,7 @@ func (s1 nodeConditionList) Equal(s2 nodeConditionList) bool { } // thresholdList is a simple alias to support equality checking independent of order -type thresholdList []Threshold +type thresholdList []evictionapi.Threshold // Equal adds the ability to check equality between two lists of node conditions. func (s1 thresholdList) Equal(s2 thresholdList) bool { diff --git a/pkg/kubelet/eviction/types.go b/pkg/kubelet/eviction/types.go index a006f0593d..c1c7d1f60a 100644 --- a/pkg/kubelet/eviction/types.go +++ b/pkg/kubelet/eviction/types.go @@ -23,22 +23,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/kubernetes/pkg/api/v1" statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats" -) - -// Signal defines a signal that can trigger eviction of pods on a node. 
-type Signal string - -const ( - // SignalMemoryAvailable is memory available (i.e. capacity - workingSet), in bytes. - SignalMemoryAvailable Signal = "memory.available" - // SignalNodeFsAvailable is amount of storage available on filesystem that kubelet uses for volumes, daemon logs, etc. - SignalNodeFsAvailable Signal = "nodefs.available" - // SignalNodeFsInodesFree is amount of inodes available on filesystem that kubelet uses for volumes, daemon logs, etc. - SignalNodeFsInodesFree Signal = "nodefs.inodesFree" - // SignalImageFsAvailable is amount of storage available on filesystem that container runtime uses for storing images and container writable layers. - SignalImageFsAvailable Signal = "imagefs.available" - // SignalImageFsInodesFree is amount of inodes available on filesystem that container runtime uses for storing images and container writeable layers. - SignalImageFsInodesFree Signal = "imagefs.inodesFree" + evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" ) // fsStatsType defines the types of filesystem stats to collect. @@ -53,14 +38,6 @@ const ( fsStatsRoot fsStatsType = "root" ) -// ThresholdOperator is the operator used to express a Threshold. -type ThresholdOperator string - -const ( - // OpLessThan is the operator that expresses a less than operator. - OpLessThan ThresholdOperator = "LessThan" -) - // Config holds information about how eviction is configured. type Config struct { // PressureTransitionPeriod is duration the kubelet has to wait before transititioning out of a pressure condition. @@ -68,35 +45,11 @@ type Config struct { // Maximum allowed grace period (in seconds) to use when terminating pods in response to a soft eviction threshold being met. MaxPodGracePeriodSeconds int64 // Thresholds define the set of conditions monitored to trigger eviction. - Thresholds []Threshold + Thresholds []evictionapi.Threshold // KernelMemcgNotification if true will integrate with the kernel memcg notification to determine if memory thresholds are crossed. KernelMemcgNotification bool } -// ThresholdValue is a value holder that abstracts literal versus percentage based quantity -type ThresholdValue struct { - // The following fields are exclusive. Only the topmost non-zero field is used. - - // Quantity is a quantity associated with the signal that is evaluated against the specified operator. - Quantity *resource.Quantity - // Percentage represents the usage percentage over the total resource that is evaluated against the specified operator. - Percentage float32 -} - -// Threshold defines a metric for when eviction should occur. -type Threshold struct { - // Signal defines the entity that was measured. - Signal Signal - // Operator represents a relationship of a signal to a value. - Operator ThresholdOperator - // Value is the threshold the resource is evaluated against. - Value ThresholdValue - // GracePeriod represents the amount of time that a threshold must be met before eviction is triggered. - GracePeriod time.Duration - // MinReclaim represents the minimum amount of resource to reclaim if the threshold is met. - MinReclaim *ThresholdValue -} - // Manager evaluates when an eviction threshold for node stability has been met on the node. type Manager interface { // Start starts the control loop to monitor eviction thresholds at specified interval. 
@@ -150,10 +103,10 @@ type signalObservation struct { } // signalObservations maps a signal to an observed quantity -type signalObservations map[Signal]signalObservation +type signalObservations map[evictionapi.Signal]signalObservation // thresholdsObservedAt maps a threshold to a time that it was observed -type thresholdsObservedAt map[Threshold]time.Time +type thresholdsObservedAt map[evictionapi.Threshold]time.Time // nodeConditionsObservedAt maps a node condition to a time that it was observed type nodeConditionsObservedAt map[v1.NodeConditionType]time.Time diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 34b6fa6269..878c955650 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -34,7 +34,6 @@ import ( clientgoclientset "k8s.io/client-go/kubernetes" cadvisorapi "github.com/google/cadvisor/info/v1" - "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/labels" @@ -359,11 +358,6 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub KernelMemcgNotification: kubeCfg.ExperimentalKernelMemcgNotification, } - reservation, err := ParseReservation(kubeCfg.KubeReserved, kubeCfg.SystemReserved) - if err != nil { - return nil, err - } - var dockerExecHandler dockertools.ExecHandler switch kubeCfg.DockerExecHandlerName { case "native": @@ -465,7 +459,6 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub nodeIP: net.ParseIP(kubeCfg.NodeIP), clock: clock.RealClock{}, outOfDiskTransitionFrequency: kubeCfg.OutOfDiskTransitionFrequency.Duration, - reservation: *reservation, enableCustomMetrics: kubeCfg.EnableCustomMetrics, babysitDaemons: kubeCfg.BabysitDaemons, enableControllerAttachDetach: kubeCfg.EnableControllerAttachDetach, @@ -1034,10 +1027,6 @@ type Kubelet struct { // getting rescheduled onto the node. outOfDiskTransitionFrequency time.Duration - // reservation specifies resources which are reserved for non-pod usage, including kubernetes and - // non-kubernetes system processes. - reservation kubetypes.Reservation - // support gathering custom metrics. enableCustomMetrics bool @@ -2119,47 +2108,6 @@ func isSyncPodWorthy(event *pleg.PodLifecycleEvent) bool { return event.Type != pleg.ContainerRemoved } -// parseResourceList parses the given configuration map into an API -// ResourceList or returns an error. -func parseResourceList(m componentconfig.ConfigurationMap) (v1.ResourceList, error) { - rl := make(v1.ResourceList) - for k, v := range m { - switch v1.ResourceName(k) { - // Only CPU and memory resources are supported. - case v1.ResourceCPU, v1.ResourceMemory: - q, err := resource.ParseQuantity(v) - if err != nil { - return nil, err - } - if q.Sign() == -1 { - return nil, fmt.Errorf("resource quantity for %q cannot be negative: %v", k, v) - } - rl[v1.ResourceName(k)] = q - default: - return nil, fmt.Errorf("cannot reserve %q resource", k) - } - } - return rl, nil -} - -// ParseReservation parses the given kubelet- and system- reservations -// configuration maps into an internal Reservation instance or returns an -// error. 
-func ParseReservation(kubeReserved, systemReserved componentconfig.ConfigurationMap) (*kubetypes.Reservation, error) { - reservation := new(kubetypes.Reservation) - if rl, err := parseResourceList(kubeReserved); err != nil { - return nil, err - } else { - reservation.Kubernetes = rl - } - if rl, err := parseResourceList(systemReserved); err != nil { - return nil, err - } else { - reservation.System = rl - } - return reservation, nil -} - // Gets the streaming server configuration to use with in-process CRI shims. func getStreamingConfig(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *KubeletDeps) *streaming.Config { config := &streaming.Config{ diff --git a/pkg/kubelet/kubelet_node_status.go b/pkg/kubelet/kubelet_node_status.go index b3064c26ab..50b1aeffc8 100644 --- a/pkg/kubelet/kubelet_node_status.go +++ b/pkg/kubelet/kubelet_node_status.go @@ -522,18 +522,14 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) { } // Set Allocatable. - node.Status.Allocatable = make(v1.ResourceList) + if node.Status.Allocatable == nil { + node.Status.Allocatable = make(v1.ResourceList) + } + allocatableReservation := kl.containerManager.GetNodeAllocatableReservation() for k, v := range node.Status.Capacity { value := *(v.Copy()) - if kl.reservation.System != nil { - value.Sub(kl.reservation.System[k]) - } - if kl.reservation.Kubernetes != nil { - value.Sub(kl.reservation.Kubernetes[k]) - } - if value.Sign() < 0 { - // Negative Allocatable resources don't make sense. - value.Set(0) + if res, exists := allocatableReservation[k]; exists { + value.Sub(res) } node.Status.Allocatable[k] = value } diff --git a/pkg/kubelet/kubelet_node_status_test.go b/pkg/kubelet/kubelet_node_status_test.go index 026326fb54..a77622728b 100644 --- a/pkg/kubelet/kubelet_node_status_test.go +++ b/pkg/kubelet/kubelet_node_status_test.go @@ -41,6 +41,7 @@ import ( core "k8s.io/client-go/testing" "k8s.io/kubernetes/pkg/api/v1" "k8s.io/kubernetes/pkg/client/clientset_generated/clientset/fake" + "k8s.io/kubernetes/pkg/kubelet/cm" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" "k8s.io/kubernetes/pkg/kubelet/util/sliceutils" "k8s.io/kubernetes/pkg/version" @@ -109,6 +110,15 @@ func applyNodeStatusPatch(originalNode *v1.Node, patch []byte) (*v1.Node, error) return updatedNode, nil } +type localCM struct { + cm.ContainerManager + allocatable v1.ResourceList +} + +func (lcm *localCM) GetNodeAllocatableReservation() v1.ResourceList { + return lcm.allocatable +} + func TestUpdateNewNodeStatus(t *testing.T) { // generate one more than maxImagesInNodeStatus in inputImageList inputImageList, expectedImageList := generateTestingImageList(maxImagesInNodeStatus + 1) @@ -116,6 +126,13 @@ func TestUpdateNewNodeStatus(t *testing.T) { t, inputImageList, false /* controllerAttachDetachEnabled */) defer testKubelet.Cleanup() kubelet := testKubelet.kubelet + kubelet.containerManager = &localCM{ + ContainerManager: cm.NewStubContainerManager(), + allocatable: v1.ResourceList{ + v1.ResourceCPU: *resource.NewMilliQuantity(200, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(100E6, resource.BinarySI), + }, + } kubeClient := testKubelet.fakeKubeClient existingNode := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: testKubeletHostname}} kubeClient.ReactionChain = fake.NewSimpleClientset(&v1.NodeList{Items: []v1.Node{existingNode}}).ReactionChain @@ -332,6 +349,14 @@ func TestUpdateExistingNodeStatus(t *testing.T) { testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */) defer testKubelet.Cleanup() 
kubelet := testKubelet.kubelet + kubelet.containerManager = &localCM{ + ContainerManager: cm.NewStubContainerManager(), + allocatable: v1.ResourceList{ + v1.ResourceCPU: *resource.NewMilliQuantity(200, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(100E6, resource.BinarySI), + }, + } + kubeClient := testKubelet.fakeKubeClient existingNode := v1.Node{ ObjectMeta: metav1.ObjectMeta{Name: testKubeletHostname}, @@ -377,9 +402,10 @@ func TestUpdateExistingNodeStatus(t *testing.T) { v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI), }, Allocatable: v1.ResourceList{ - v1.ResourceCPU: *resource.NewMilliQuantity(2800, resource.DecimalSI), - v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI), - v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI), + v1.ResourceCPU: *resource.NewMilliQuantity(2800, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI), + v1.ResourcePods: *resource.NewQuantity(0, resource.DecimalSI), + v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI), }, }, } @@ -687,6 +713,14 @@ func TestUpdateNodeStatusWithRuntimeStateError(t *testing.T) { testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */) defer testKubelet.Cleanup() kubelet := testKubelet.kubelet + kubelet.containerManager = &localCM{ + ContainerManager: cm.NewStubContainerManager(), + allocatable: v1.ResourceList{ + v1.ResourceCPU: *resource.NewMilliQuantity(200, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(100E6, resource.BinarySI), + }, + } + clock := testKubelet.fakeClock kubeClient := testKubelet.fakeKubeClient existingNode := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: testKubeletHostname}} diff --git a/pkg/kubelet/kubelet_resources.go b/pkg/kubelet/kubelet_resources.go index ea2a3033b6..21fecbdee8 100644 --- a/pkg/kubelet/kubelet_resources.go +++ b/pkg/kubelet/kubelet_resources.go @@ -19,6 +19,8 @@ package kubelet import ( "fmt" + + "github.com/golang/glog" + "k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api/v1" "k8s.io/kubernetes/pkg/fieldpath" @@ -41,7 +43,7 @@ func (kl *Kubelet) defaultPodLimitsForDownwardApi(pod *v1.Pod, container *v1.Con return nil, nil, fmt.Errorf("failed to find node object, expected a node") } allocatable := node.Status.Allocatable - + glog.V(4).Infof("allocatable: %v", allocatable) podCopy, err := api.Scheme.Copy(pod) if err != nil { return nil, nil, fmt.Errorf("failed to perform a deep copy of pod object: %v", err) diff --git a/pkg/kubelet/kubelet_resources_test.go b/pkg/kubelet/kubelet_resources_test.go index 2c8096f154..a5505d1240 100644 --- a/pkg/kubelet/kubelet_resources_test.go +++ b/pkg/kubelet/kubelet_resources_test.go @@ -25,8 +25,8 @@ import ( cadvisorapiv2 "github.com/google/cadvisor/info/v2" apiequality "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/kubernetes/pkg/api/v1" - kubetypes "k8s.io/kubernetes/pkg/kubelet/types" ) func TestPodResourceLimitsDefaulting(t *testing.T) { @@ -41,18 +41,21 @@ func TestPodResourceLimitsDefaulting(t *testing.T) { }, nil) tk.fakeCadvisor.On("ImagesFsInfo").Return(cadvisorapiv2.FsInfo{}, nil) tk.fakeCadvisor.On("RootFsInfo").Return(cadvisorapiv2.FsInfo{}, nil) - - tk.kubelet.reservation = kubetypes.Reservation{ - Kubernetes: v1.ResourceList{ - v1.ResourceCPU: resource.MustParse("3"), - v1.ResourceMemory: resource.MustParse("4Gi"), - }, - System: v1.ResourceList{ - v1.ResourceCPU: resource.MustParse("1"), -
v1.ResourceMemory: resource.MustParse("2Gi"), + tk.kubelet.nodeInfo = &testNodeInfo{ + nodes: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: string(tk.kubelet.nodeName), + }, + Status: v1.NodeStatus{ + Allocatable: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("6"), + v1.ResourceMemory: resource.MustParse("4Gi"), + }, + }, + }, }, } - cases := []struct { pod *v1.Pod expected *v1.Pod diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go index 718523cd67..2c49e7085e 100644 --- a/pkg/kubelet/kubelet_test.go +++ b/pkg/kubelet/kubelet_test.go @@ -222,12 +222,6 @@ func newTestKubeletWithImageList( kubelet.backOff.Clock = fakeClock kubelet.podKillingCh = make(chan *kubecontainer.PodPair, 20) kubelet.resyncInterval = 10 * time.Second - kubelet.reservation = kubetypes.Reservation{ - Kubernetes: v1.ResourceList{ - v1.ResourceCPU: resource.MustParse(testReservationCPU), - v1.ResourceMemory: resource.MustParse(testReservationMemory), - }, - } kubelet.workQueue = queue.NewBasicWorkQueue(fakeClock) // Relist period does not affect the tests. kubelet.pleg = pleg.NewGenericPLEG(fakeRuntime, 100, time.Hour, nil, clock.RealClock{}) diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD index 1efadf01ad..31cf23408a 100644 --- a/test/e2e_node/BUILD +++ b/test/e2e_node/BUILD @@ -50,7 +50,6 @@ go_test( name = "go_default_test", srcs = [ "apparmor_test.go", - "cgroup_manager_test.go", "container_manager_test.go", "critical_pod_test.go", "density_test.go", @@ -65,6 +64,8 @@ go_test( "log_path_test.go", "memory_eviction_test.go", "mirror_pod_test.go", + "node_container_manager_test.go", + "pods_container_manager_test.go", "resource_usage_test.go", "restart_test.go", "runtime_conformance_test.go", @@ -117,6 +118,7 @@ go_test( "//vendor:k8s.io/apimachinery/pkg/util/intstr", "//vendor:k8s.io/apimachinery/pkg/util/uuid", "//vendor:k8s.io/apimachinery/pkg/watch", + "//vendor:k8s.io/client-go/pkg/api", "//vendor:k8s.io/client-go/tools/cache", ], ) diff --git a/test/e2e_node/container_manager_test.go b/test/e2e_node/container_manager_test.go index f723af6d1c..88b424744d 100644 --- a/test/e2e_node/container_manager_test.go +++ b/test/e2e_node/container_manager_test.go @@ -70,9 +70,8 @@ func validateOOMScoreAdjSettingIsInRange(pid int, expectedMinOOMScoreAdj, expect return nil } -var _ = framework.KubeDescribe("Kubelet Container Manager [Serial]", func() { +var _ = framework.KubeDescribe("Container Manager Misc [Serial]", func() { f := framework.NewDefaultFramework("kubelet-container-manager") - Describe("Validate OOM score adjustments", func() { Context("once the node is setup", func() { It("docker daemon's oom-score-adj should be -999", func() { diff --git a/test/e2e_node/node_container_manager_test.go b/test/e2e_node/node_container_manager_test.go new file mode 100644 index 0000000000..b1684a022e --- /dev/null +++ b/test/e2e_node/node_container_manager_test.go @@ -0,0 +1,247 @@ +// +build linux + +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package e2e_node
+
+import (
+	"fmt"
+	"io/ioutil"
+	"path"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/pkg/api"
+	"k8s.io/kubernetes/pkg/api/v1"
+	"k8s.io/kubernetes/pkg/apis/componentconfig"
+	"k8s.io/kubernetes/pkg/kubelet/cm"
+	"k8s.io/kubernetes/test/e2e/framework"
+
+	. "github.com/onsi/ginkgo"
+)
+
+func setDesiredConfiguration(initialConfig *componentconfig.KubeletConfiguration) {
+	initialConfig.EnforceNodeAllocatable = []string{"pods", "kube-reserved", "system-reserved"}
+	initialConfig.SystemReserved = componentconfig.ConfigurationMap{
+		"cpu":    "100m",
+		"memory": "100Mi",
+	}
+	initialConfig.KubeReserved = componentconfig.ConfigurationMap{
+		"cpu":    "100m",
+		"memory": "100Mi",
+	}
+	initialConfig.EvictionHard = "memory.available<100Mi"
+	// Necessary for allocatable cgroup creation.
+	initialConfig.CgroupsPerQOS = true
+	initialConfig.KubeReservedCgroup = kubeReservedCgroup
+	initialConfig.SystemReservedCgroup = systemReservedCgroup
+}
+
+var _ = framework.KubeDescribe("Node Container Manager [Serial]", func() {
+	f := framework.NewDefaultFramework("node-container-manager")
+	Describe("Validate Node Allocatable", func() {
+		It("sets up the node and runs the test", func() {
+			framework.ExpectNoError(runTest(f))
+		})
+	})
+
+})
+
+func expectFileValToEqual(filePath string, expectedValue, delta int64) error {
+	out, err := ioutil.ReadFile(filePath)
+	if err != nil {
+		return fmt.Errorf("failed to read file %q", filePath)
+	}
+	actual, err := strconv.ParseInt(strings.TrimSpace(string(out)), 10, 64)
+	if err != nil {
+		return fmt.Errorf("failed to parse output: %v", err)
+	}
+
+	// Ensure that values are within a delta range to work around rounding errors.
+	if (actual < (expectedValue - delta)) || (actual > (expectedValue + delta)) {
+		return fmt.Errorf("Expected value at %q to be between %d and %d. Got %d", filePath, (expectedValue - delta), (expectedValue + delta), actual)
+	}
+	return nil
+}
+
+func getAllocatableLimits(cpu, memory string, capacity v1.ResourceList) (*resource.Quantity, *resource.Quantity) {
+	var allocatableCPU, allocatableMemory *resource.Quantity
+	// Total cpu reservation is 200m.
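+	// (That is 100m of kube-reserved plus 100m of system-reserved, as configured
+	// in setDesiredConfiguration; runTest passes "200m" accordingly.)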
+	for k, v := range capacity {
+		if k == v1.ResourceCPU {
+			allocatableCPU = v.Copy()
+			allocatableCPU.Sub(resource.MustParse(cpu))
+		}
+		if k == v1.ResourceMemory {
+			allocatableMemory = v.Copy()
+			allocatableMemory.Sub(resource.MustParse(memory))
+		}
+	}
+	return allocatableCPU, allocatableMemory
+}
+
+const (
+	kubeReservedCgroup   = "/kube_reserved"
+	systemReservedCgroup = "/system_reserved"
+)
+
+func createIfNotExists(cm cm.CgroupManager, cgroupConfig *cm.CgroupConfig) error {
+	if !cm.Exists(cgroupConfig.Name) {
+		if err := cm.Create(cgroupConfig); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func createTemporaryCgroupsForReservation(cgroupManager cm.CgroupManager) error {
+	// Create kube reserved cgroup
+	cgroupConfig := &cm.CgroupConfig{
+		Name: cm.CgroupName(kubeReservedCgroup),
+	}
+	if err := createIfNotExists(cgroupManager, cgroupConfig); err != nil {
+		return err
+	}
+	// Create system reserved cgroup
+	cgroupConfig.Name = cm.CgroupName(systemReservedCgroup)
+
+	return createIfNotExists(cgroupManager, cgroupConfig)
+}
+
+func destroyTemporaryCgroupsForReservation(cgroupManager cm.CgroupManager) error {
+	// Destroy kube reserved cgroup
+	cgroupConfig := &cm.CgroupConfig{
+		Name: cm.CgroupName(kubeReservedCgroup),
+	}
+	if err := cgroupManager.Destroy(cgroupConfig); err != nil {
+		return err
+	}
+	// Destroy system reserved cgroup
+	cgroupConfig.Name = cm.CgroupName(systemReservedCgroup)
+	return cgroupManager.Destroy(cgroupConfig)
+}
+
+func runTest(f *framework.Framework) error {
+	var oldCfg *componentconfig.KubeletConfiguration
+	subsystems, err := cm.GetCgroupSubsystems()
+	if err != nil {
+		return err
+	}
+	// Get current kubelet configuration
+	oldCfg, err = getCurrentKubeletConfig()
+	if err != nil {
+		return err
+	}
+
+	// Create a cgroup manager object for manipulating cgroups.
+	cgroupManager := cm.NewCgroupManager(subsystems, oldCfg.CgroupDriver)
+
+	defer destroyTemporaryCgroupsForReservation(cgroupManager)
+	defer func() {
+		if oldCfg != nil {
+			framework.ExpectNoError(setKubeletConfiguration(f, oldCfg))
+		}
+	}()
+	if err := createTemporaryCgroupsForReservation(cgroupManager); err != nil {
+		return err
+	}
+	clone, err := api.Scheme.DeepCopy(oldCfg)
+	if err != nil {
+		return err
+	}
+	newCfg := clone.(*componentconfig.KubeletConfiguration)
+	// Modify the existing kubelet configuration to enforce node allocatable.
+	setDesiredConfiguration(newCfg)
+	// Set the new kubelet configuration.
+	err = setKubeletConfiguration(f, newCfg)
+	if err != nil {
+		return err
+	}
+	// The new configuration is now the current one.
+	currentConfig := newCfg
+
+	expectedNAPodCgroup := path.Join(currentConfig.CgroupRoot, "kubepods")
+	if !cgroupManager.Exists(cm.CgroupName(expectedNAPodCgroup)) {
+		return fmt.Errorf("Expected node allocatable cgroup %q to exist", expectedNAPodCgroup)
+	}
+	// TODO: Update cgroupManager to expose a Status interface to get current Cgroup Settings.
+	nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
+	if err != nil {
+		return err
+	}
+	if len(nodeList.Items) != 1 {
+		return fmt.Errorf("Unexpected number of node objects for node e2e. Expected exactly one node: %+v", nodeList)
+	}
+	node := nodeList.Items[0]
+	capacity := node.Status.Capacity
+	allocatableCPU, allocatableMemory := getAllocatableLimits("200m", "200Mi", capacity)
+	// Total memory reservation is 200Mi excluding eviction thresholds.
+	// Expect CPU shares on node allocatable cgroup to equal allocatable.
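+	// (cm.MilliCPUToShares converts millicores to cgroup cpu.shares linearly:
+	// 1000m == 1024 shares, with a floor of 2 shares. The delta of 10 passed
+	// below absorbs integer rounding in that conversion.)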
+	if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], "kubepods", "cpu.shares"), cm.MilliCPUToShares(allocatableCPU.MilliValue()), 10); err != nil {
+		return err
+	}
+	// Expect memory limit on node allocatable cgroup to equal allocatable.
+	if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["memory"], "kubepods", "memory.limit_in_bytes"), allocatableMemory.Value(), 0); err != nil {
+		return err
+	}
+
+	// Check that Allocatable reported to scheduler includes eviction thresholds.
+	schedulerAllocatable := node.Status.Allocatable
+	// Memory allocatable should take into account eviction thresholds.
+	allocatableCPU, allocatableMemory = getAllocatableLimits("200m", "300Mi", capacity)
+	// Expect allocatable to include all resources in capacity.
+	if len(schedulerAllocatable) != len(capacity) {
+		return fmt.Errorf("Expected all resources in capacity to be found in allocatable")
+	}
+	// CPU based evictions are not supported.
+	if allocatableCPU.Cmp(schedulerAllocatable["cpu"]) != 0 {
+		return fmt.Errorf("Unexpected cpu allocatable value exposed by the node. Expected: %v, got: %v, capacity: %v", allocatableCPU, schedulerAllocatable["cpu"], capacity["cpu"])
+	}
+	if allocatableMemory.Cmp(schedulerAllocatable["memory"]) != 0 {
+		return fmt.Errorf("Unexpected memory allocatable value exposed by the node. Expected: %v, got: %v, capacity: %v", allocatableMemory, schedulerAllocatable["memory"], capacity["memory"])
+	}
+
+	if !cgroupManager.Exists(cm.CgroupName(kubeReservedCgroup)) {
+		return fmt.Errorf("Expected kube reserved cgroup %q to exist", kubeReservedCgroup)
+	}
+	// Expect CPU shares on kube reserved cgroup to equal its reservation, which is `100m`.
+	kubeReservedCPU := resource.MustParse(currentConfig.KubeReserved["cpu"])
+	if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], kubeReservedCgroup, "cpu.shares"), cm.MilliCPUToShares(kubeReservedCPU.MilliValue()), 10); err != nil {
+		return err
+	}
+	// Expect memory limit on kube reserved cgroup to equal the configured value `100Mi`.
+	kubeReservedMemory := resource.MustParse(currentConfig.KubeReserved["memory"])
+	if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["memory"], kubeReservedCgroup, "memory.limit_in_bytes"), kubeReservedMemory.Value(), 0); err != nil {
+		return err
+	}
+	if !cgroupManager.Exists(cm.CgroupName(systemReservedCgroup)) {
+		return fmt.Errorf("Expected system reserved cgroup %q to exist", systemReservedCgroup)
+	}
+	// Expect CPU shares on system reserved cgroup to equal its reservation, which is `100m`.
+	systemReservedCPU := resource.MustParse(currentConfig.SystemReserved["cpu"])
+	if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], systemReservedCgroup, "cpu.shares"), cm.MilliCPUToShares(systemReservedCPU.MilliValue()), 10); err != nil {
+		return err
+	}
+	// Expect memory limit on system reserved cgroup to equal the configured value `100Mi`.
+	systemReservedMemory := resource.MustParse(currentConfig.SystemReserved["memory"])
+	if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["memory"], systemReservedCgroup, "memory.limit_in_bytes"), systemReservedMemory.Value(), 0); err != nil {
+		return err
+	}
+	return nil
+}
diff --git a/test/e2e_node/cgroup_manager_test.go b/test/e2e_node/pods_container_manager_test.go
similarity index 96%
rename from test/e2e_node/cgroup_manager_test.go
rename to test/e2e_node/pods_container_manager_test.go
index 5f6095dc4f..043443c4fd 100644
--- a/test/e2e_node/cgroup_manager_test.go
+++ b/test/e2e_node/pods_container_manager_test.go
@@ -17,6 +17,8 @@ limitations under the License.
 package e2e_node
 
 import (
+	"path"
+
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/uuid"
@@ -24,6 +26,7 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/cm"
 	"k8s.io/kubernetes/test/e2e/framework"
 
+	"github.com/golang/glog"
 	. "github.com/onsi/ginkgo"
 	. "github.com/onsi/gomega"
 )
@@ -49,18 +52,23 @@ func getResourceRequirements(requests, limits v1.ResourceList) v1.ResourceRequir
 	return res
 }
 
+// Kubelet internal cgroup name for node allocatable cgroup.
+const defaultNodeAllocatableCgroup = "kubepods"
+
 // makePodToVerifyCgroups returns a pod that verifies the existence of the specified cgroups.
 func makePodToVerifyCgroups(cgroupNames []cm.CgroupName) *v1.Pod {
 	// convert the names to their literal cgroupfs forms...
 	cgroupFsNames := []string{}
 	for _, cgroupName := range cgroupNames {
+		// Add top level cgroup used to enforce node allocatable.
+		cgroupName = cm.CgroupName(path.Join(defaultNodeAllocatableCgroup, string(cgroupName)))
 		if framework.TestContext.KubeletConfig.CgroupDriver == "systemd" {
 			cgroupFsNames = append(cgroupFsNames, cm.ConvertCgroupNameToSystemd(cgroupName, true))
 		} else {
 			cgroupFsNames = append(cgroupFsNames, string(cgroupName))
 		}
 	}
-
+	glog.Infof("expecting %v cgroups to be found", cgroupFsNames)
 	// build the pod command to either verify cgroups exist
 	command := ""
 	for _, cgroupFsName := range cgroupFsNames {
diff --git a/test/e2e_node/util.go b/test/e2e_node/util.go
index 1bbd4d1d8f..6f6361091e 100644
--- a/test/e2e_node/util.go
+++ b/test/e2e_node/util.go
@@ -95,6 +95,7 @@ func tempSetEvictionHard(f *framework.Framework, evictionHard string) {
 
 // Must be called within a Context. Allows the function to modify the KubeletConfiguration during the BeforeEach of the context.
 // The change is reverted in the AfterEach of the context.
+// updateFunction is invoked with the current configuration during BeforeEach.
 func tempSetCurrentKubeletConfig(f *framework.Framework, updateFunction func(initialConfig *componentconfig.KubeletConfiguration)) {
 	var oldCfg *componentconfig.KubeletConfiguration
 	BeforeEach(func() {
@@ -292,3 +293,9 @@ func logNodeEvents(f *framework.Framework) {
 	err := framework.ListNamespaceEvents(f.ClientSet, "")
 	framework.ExpectNoError(err)
 }
+
+func getLocalNode(f *framework.Framework) *v1.Node {
+	nodeList := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
+	Expect(len(nodeList.Items)).To(Equal(1), "Unexpected number of node objects for node e2e. Expected exactly one node.")
+	return &nodeList.Items[0]
+}
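Note on the arithmetic runTest asserts: with node allocatable phase 2, the node allocatable ("kubepods") cgroup limit is capacity minus kube- and system-reserved, while the Allocatable reported to the scheduler additionally subtracts hard eviction thresholds (memory only, since CPU based evictions are not supported). A self-contained sketch of that arithmetic, using a hypothetical 2000Mi memory capacity; the reserved and eviction values mirror setDesiredConfiguration:

    package main

    import (
    	"fmt"

    	"k8s.io/apimachinery/pkg/api/resource"
    )

    func main() {
    	capacity := resource.MustParse("2000Mi") // hypothetical node memory capacity
    	kubeReserved := resource.MustParse("100Mi")
    	systemReserved := resource.MustParse("100Mi")
    	evictionHard := resource.MustParse("100Mi") // from "memory.available<100Mi"

    	// kubepods memory.limit_in_bytes: capacity minus reservations.
    	cgroupLimit := capacity.Copy()
    	cgroupLimit.Sub(kubeReserved)
    	cgroupLimit.Sub(systemReserved)

    	// Scheduler-visible Allocatable: additionally subtract the hard eviction threshold.
    	allocatable := cgroupLimit.Copy()
    	allocatable.Sub(evictionHard)

    	fmt.Println(cgroupLimit.String(), allocatable.String()) // 1800Mi 1700Mi
    }

This corresponds to the 200Mi reservation checked against the cgroup file and the 300Mi total used for the scheduler-facing check in runTest.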