Adds support for PodCIDR allocation from the GCE cloud provider

If CIDRAllocatorType is set to `CloudCIDRAllocator`, then allocation
of CIDR allocation instead is done by the external cloud provider and
the node controller is only responsible for reflecting the allocation
into the node spec.

- Splits off the rangeAllocator from the cidr_allocator.go file.
- Adds cloudCIDRAllocator, which is used when the cloud provider allocates
  the CIDR ranges externally. (GCE support only)
- Updates RBAC permission for node controller to include PATCH
pull/6/head
Bowei Du 2017-02-27 00:33:55 -08:00
parent 345c65847f
commit f61590c221
12 changed files with 614 additions and 439 deletions

View File

@ -477,6 +477,7 @@ func StartControllers(controllers map[string]InitFunc, s *options.CMServer, root
serviceCIDR,
int(s.NodeCIDRMaskSize),
s.AllocateNodeCIDRs,
nodecontroller.CIDRAllocatorType(s.CIDRAllocatorType),
s.EnableTaintManager,
utilfeature.DefaultFeatureGate.Enabled(features.TaintBasedEvictions),
)

View File

@ -188,7 +188,10 @@ func (s *CMServer) AddFlags(fs *pflag.FlagSet, allControllers []string, disabled
fs.StringVar(&s.ClusterCIDR, "cluster-cidr", s.ClusterCIDR, "CIDR Range for Pods in cluster.")
fs.StringVar(&s.ServiceCIDR, "service-cluster-ip-range", s.ServiceCIDR, "CIDR Range for Services in cluster.")
fs.Int32Var(&s.NodeCIDRMaskSize, "node-cidr-mask-size", s.NodeCIDRMaskSize, "Mask size for node cidr in cluster.")
fs.BoolVar(&s.AllocateNodeCIDRs, "allocate-node-cidrs", false, "Should CIDRs for Pods be allocated and set on the cloud provider.")
fs.BoolVar(&s.AllocateNodeCIDRs, "allocate-node-cidrs", false,
"Should CIDRs for Pods be allocated and set on the cloud provider.")
fs.StringVar(&s.CIDRAllocatorType, "cidr-allocator-type", "RangeAllocator",
"Type of CIDR allocator to use")
fs.BoolVar(&s.ConfigureCloudRoutes, "configure-cloud-routes", true, "Should CIDRs allocated by allocate-node-cidrs be configured on the cloud provider.")
fs.StringVar(&s.Master, "master", s.Master, "The address of the Kubernetes API server (overrides any value in kubeconfig)")
fs.StringVar(&s.Kubeconfig, "kubeconfig", s.Kubeconfig, "Path to kubeconfig file with authorization and master location information.")

View File

@ -794,9 +794,11 @@ type KubeControllerManagerConfiguration struct {
ServiceCIDR string
// NodeCIDRMaskSize is the mask size for node cidr in cluster.
NodeCIDRMaskSize int32
// allocateNodeCIDRs enables CIDRs for Pods to be allocated and, if
// AllocateNodeCIDRs enables CIDRs for Pods to be allocated and, if
// ConfigureCloudRoutes is true, to be set on the cloud provider.
AllocateNodeCIDRs bool
// CIDRAllocatorType determines what kind of pod CIDR allocator will be used.
CIDRAllocatorType string
// configureCloudRoutes enables CIDRs allocated with allocateNodeCIDRs
// to be configured on the cloud provider.
ConfigureCloudRoutes bool

View File

@ -217,10 +217,10 @@ func (gce *GCECloud) CurrentNodeName(hostname string) (types.NodeName, error) {
return types.NodeName(hostname), nil
}
// PodCIDRs returns a list of CIDR ranges that are assigned to the
// AliasRanges returns a list of CIDR ranges that are assigned to the
// `node` for allocation to pods. Returns a list of the form
// "<ip>/<netmask>".
func (gce *GCECloud) PodCIDRs(nodeName types.NodeName) (cidrs []string, err error) {
func (gce *GCECloud) AliasRanges(nodeName types.NodeName) (cidrs []string, err error) {
var instance *gceInstance
instance, err = gce.getInstanceByName(mapNodeNameToInstanceName(nodeName))
if err != nil {

View File

@ -8,56 +8,6 @@ load(
"go_test",
)
go_library(
name = "go_default_library",
srcs = [
"cidr_allocator.go",
"cidr_set.go",
"controller_utils.go",
"doc.go",
"metrics.go",
"nodecontroller.go",
"rate_limited_queue.go",
"taint_controller.go",
"timed_workers.go",
],
tags = ["automanaged"],
deps = [
"//pkg/api:go_default_library",
"//pkg/api/v1:go_default_library",
"//pkg/client/clientset_generated/clientset:go_default_library",
"//pkg/client/informers/informers_generated/externalversions/core/v1:go_default_library",
"//pkg/client/informers/informers_generated/externalversions/extensions/v1beta1:go_default_library",
"//pkg/client/listers/core/v1:go_default_library",
"//pkg/client/listers/extensions/v1beta1:go_default_library",
"//pkg/cloudprovider:go_default_library",
"//pkg/controller:go_default_library",
"//pkg/kubelet/util/format:go_default_library",
"//pkg/util/metrics:go_default_library",
"//pkg/util/node:go_default_library",
"//pkg/util/system:go_default_library",
"//pkg/util/version:go_default_library",
"//vendor:github.com/golang/glog",
"//vendor:github.com/prometheus/client_golang/prometheus",
"//vendor:k8s.io/apimachinery/pkg/api/equality",
"//vendor:k8s.io/apimachinery/pkg/api/errors",
"//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
"//vendor:k8s.io/apimachinery/pkg/fields",
"//vendor:k8s.io/apimachinery/pkg/labels",
"//vendor:k8s.io/apimachinery/pkg/types",
"//vendor:k8s.io/apimachinery/pkg/util/errors",
"//vendor:k8s.io/apimachinery/pkg/util/runtime",
"//vendor:k8s.io/apimachinery/pkg/util/sets",
"//vendor:k8s.io/apimachinery/pkg/util/wait",
"//vendor:k8s.io/client-go/kubernetes/typed/core/v1",
"//vendor:k8s.io/client-go/pkg/api/v1",
"//vendor:k8s.io/client-go/tools/cache",
"//vendor:k8s.io/client-go/tools/record",
"//vendor:k8s.io/client-go/util/flowcontrol",
"//vendor:k8s.io/client-go/util/workqueue",
],
)
go_test(
name = "go_default_test",
srcs = [
@ -96,6 +46,59 @@ go_test(
],
)
go_library(
name = "go_default_library",
srcs = [
"cidr_allocator.go",
"cidr_set.go",
"cloud_cidr_allocator.go",
"controller_utils.go",
"doc.go",
"metrics.go",
"nodecontroller.go",
"range_allocator.go",
"rate_limited_queue.go",
"taint_controller.go",
"timed_workers.go",
],
tags = ["automanaged"],
deps = [
"//pkg/api:go_default_library",
"//pkg/api/v1:go_default_library",
"//pkg/client/clientset_generated/clientset:go_default_library",
"//pkg/client/informers/informers_generated/externalversions/core/v1:go_default_library",
"//pkg/client/informers/informers_generated/externalversions/extensions/v1beta1:go_default_library",
"//pkg/client/listers/core/v1:go_default_library",
"//pkg/client/listers/extensions/v1beta1:go_default_library",
"//pkg/cloudprovider:go_default_library",
"//pkg/cloudprovider/providers/gce:go_default_library",
"//pkg/controller:go_default_library",
"//pkg/kubelet/util/format:go_default_library",
"//pkg/util/metrics:go_default_library",
"//pkg/util/node:go_default_library",
"//pkg/util/system:go_default_library",
"//pkg/util/version:go_default_library",
"//vendor:github.com/golang/glog",
"//vendor:github.com/prometheus/client_golang/prometheus",
"//vendor:k8s.io/apimachinery/pkg/api/equality",
"//vendor:k8s.io/apimachinery/pkg/api/errors",
"//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
"//vendor:k8s.io/apimachinery/pkg/fields",
"//vendor:k8s.io/apimachinery/pkg/labels",
"//vendor:k8s.io/apimachinery/pkg/types",
"//vendor:k8s.io/apimachinery/pkg/util/errors",
"//vendor:k8s.io/apimachinery/pkg/util/runtime",
"//vendor:k8s.io/apimachinery/pkg/util/sets",
"//vendor:k8s.io/apimachinery/pkg/util/wait",
"//vendor:k8s.io/client-go/kubernetes/typed/core/v1",
"//vendor:k8s.io/client-go/pkg/api/v1",
"//vendor:k8s.io/client-go/tools/cache",
"//vendor:k8s.io/client-go/tools/record",
"//vendor:k8s.io/client-go/util/flowcontrol",
"//vendor:k8s.io/client-go/util/workqueue",
],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),

View File

@ -18,259 +18,34 @@ package node
import (
"errors"
"fmt"
"net"
"sync"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
v1core "k8s.io/client-go/kubernetes/typed/core/v1"
clientv1 "k8s.io/client-go/pkg/api/v1"
"k8s.io/client-go/tools/record"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/client/clientset_generated/clientset"
"github.com/golang/glog"
v1 "k8s.io/kubernetes/pkg/api/v1"
)
// TODO: figure out the good setting for those constants.
const (
// controls how many NodeSpec updates NC can process concurrently.
cidrUpdateWorkers = 10
cidrUpdateQueueSize = 5000
// podCIDRUpdateRetry controls the number of retries of writing Node.Spec.PodCIDR update.
podCIDRUpdateRetry = 5
)
var errCIDRRangeNoCIDRsRemaining = errors.New("CIDR allocation failed; there are no remaining CIDRs left to allocate in the accepted range")
var errCIDRRangeNoCIDRsRemaining = errors.New(
"CIDR allocation failed; there are no remaining CIDRs left to allocate in the accepted range")
type nodeAndCIDR struct {
cidr *net.IPNet
nodeName string
}
// CIDRAllocator is an interface implemented by things that know how to allocate/occupy/recycle CIDR for nodes.
// CIDRAllocatorType is the type of the allocator to use.
type CIDRAllocatorType string
const (
RangeAllocatorType CIDRAllocatorType = "RangeAllocator"
CloudAllocatorType CIDRAllocatorType = "CloudAllocator"
)
// CIDRAllocator is an interface implemented by things that know how to
// allocate/occupy/recycle CIDR for nodes.
type CIDRAllocator interface {
// AllocateOrOccupyCIDR looks at the given node, assigns it a valid
// CIDR if it doesn't currently have one or mark the CIDR as used if
// the node already have one.
AllocateOrOccupyCIDR(node *v1.Node) error
// ReleaseCIDR releases the CIDR of the removed node
ReleaseCIDR(node *v1.Node) error
}
type rangeAllocator struct {
client clientset.Interface
cidrs *cidrSet
clusterCIDR *net.IPNet
maxCIDRs int
// Channel that is used to pass updating Nodes with assigned CIDRs to the background
// This increases a throughput of CIDR assignment by not blocking on long operations.
nodeCIDRUpdateChannel chan nodeAndCIDR
recorder record.EventRecorder
// Keep a set of nodes that are currectly being processed to avoid races in CIDR allocation
sync.Mutex
nodesInProcessing sets.String
}
// NewCIDRRangeAllocator returns a CIDRAllocator to allocate CIDR for node
// Caller must ensure subNetMaskSize is not less than cluster CIDR mask size.
// Caller must always pass in a list of existing nodes so the new allocator
// can initialize its CIDR map. NodeList is only nil in testing.
func NewCIDRRangeAllocator(client clientset.Interface, clusterCIDR *net.IPNet, serviceCIDR *net.IPNet, subNetMaskSize int, nodeList *v1.NodeList) (CIDRAllocator, error) {
eventBroadcaster := record.NewBroadcaster()
recorder := eventBroadcaster.NewRecorder(api.Scheme, clientv1.EventSource{Component: "cidrAllocator"})
eventBroadcaster.StartLogging(glog.Infof)
if client != nil {
glog.V(0).Infof("Sending events to api server.")
eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: v1core.New(client.Core().RESTClient()).Events("")})
} else {
glog.Fatalf("kubeClient is nil when starting NodeController")
}
ra := &rangeAllocator{
client: client,
cidrs: newCIDRSet(clusterCIDR, subNetMaskSize),
clusterCIDR: clusterCIDR,
nodeCIDRUpdateChannel: make(chan nodeAndCIDR, cidrUpdateQueueSize),
recorder: recorder,
nodesInProcessing: sets.NewString(),
}
if serviceCIDR != nil {
ra.filterOutServiceRange(serviceCIDR)
} else {
glog.V(0).Info("No Service CIDR provided. Skipping filtering out service addresses.")
}
if nodeList != nil {
for _, node := range nodeList.Items {
if node.Spec.PodCIDR == "" {
glog.Infof("Node %v has no CIDR, ignoring", node.Name)
continue
} else {
glog.Infof("Node %v has CIDR %s, occupying it in CIDR map", node.Name, node.Spec.PodCIDR)
}
if err := ra.occupyCIDR(&node); err != nil {
// This will happen if:
// 1. We find garbage in the podCIDR field. Retrying is useless.
// 2. CIDR out of range: This means a node CIDR has changed.
// This error will keep crashing controller-manager.
return nil, err
}
}
}
for i := 0; i < cidrUpdateWorkers; i++ {
go func(stopChan <-chan struct{}) {
for {
select {
case workItem, ok := <-ra.nodeCIDRUpdateChannel:
if !ok {
glog.Warning("NodeCIDRUpdateChannel read returned false.")
return
}
ra.updateCIDRAllocation(workItem)
case <-stopChan:
return
}
}
}(wait.NeverStop)
}
return ra, nil
}
func (r *rangeAllocator) insertNodeToProcessing(nodeName string) bool {
r.Lock()
defer r.Unlock()
if r.nodesInProcessing.Has(nodeName) {
return false
}
r.nodesInProcessing.Insert(nodeName)
return true
}
func (r *rangeAllocator) removeNodeFromProcessing(nodeName string) {
r.Lock()
defer r.Unlock()
r.nodesInProcessing.Delete(nodeName)
}
func (r *rangeAllocator) occupyCIDR(node *v1.Node) error {
defer r.removeNodeFromProcessing(node.Name)
if node.Spec.PodCIDR == "" {
return nil
}
_, podCIDR, err := net.ParseCIDR(node.Spec.PodCIDR)
if err != nil {
return fmt.Errorf("failed to parse node %s, CIDR %s", node.Name, node.Spec.PodCIDR)
}
if err := r.cidrs.occupy(podCIDR); err != nil {
return fmt.Errorf("failed to mark cidr as occupied: %v", err)
}
return nil
}
// AllocateOrOccupyCIDR looks at the given node, assigns it a valid CIDR
// if it doesn't currently have one or mark the CIDR as used if the node already have one.
// WARNING: If you're adding any return calls or defer any more work from this function
// you have to handle correctly nodesInProcessing.
func (r *rangeAllocator) AllocateOrOccupyCIDR(node *v1.Node) error {
if node == nil {
return nil
}
if !r.insertNodeToProcessing(node.Name) {
glog.V(2).Infof("Node %v is already in a process of CIDR assignment.", node.Name)
return nil
}
if node.Spec.PodCIDR != "" {
return r.occupyCIDR(node)
}
podCIDR, err := r.cidrs.allocateNext()
if err != nil {
r.removeNodeFromProcessing(node.Name)
recordNodeStatusChange(r.recorder, node, "CIDRNotAvailable")
return fmt.Errorf("failed to allocate cidr: %v", err)
}
glog.V(10).Infof("Putting node %s with CIDR %s into the work queue", node.Name, podCIDR)
r.nodeCIDRUpdateChannel <- nodeAndCIDR{
nodeName: node.Name,
cidr: podCIDR,
}
return nil
}
// ReleaseCIDR releases the CIDR of the removed node
func (r *rangeAllocator) ReleaseCIDR(node *v1.Node) error {
if node == nil || node.Spec.PodCIDR == "" {
return nil
}
_, podCIDR, err := net.ParseCIDR(node.Spec.PodCIDR)
if err != nil {
return fmt.Errorf("Failed to parse CIDR %s on Node %v: %v", node.Spec.PodCIDR, node.Name, err)
}
glog.V(4).Infof("release CIDR %s", node.Spec.PodCIDR)
if err = r.cidrs.release(podCIDR); err != nil {
return fmt.Errorf("Error when releasing CIDR %v: %v", node.Spec.PodCIDR, err)
}
return err
}
// Marks all CIDRs with subNetMaskSize that belongs to serviceCIDR as used,
// so that they won't be assignable.
func (r *rangeAllocator) filterOutServiceRange(serviceCIDR *net.IPNet) {
// Checks if service CIDR has a nonempty intersection with cluster CIDR. It is the case if either
// clusterCIDR contains serviceCIDR with clusterCIDR's Mask applied (this means that clusterCIDR contains serviceCIDR)
// or vice versa (which means that serviceCIDR contains clusterCIDR).
if !r.clusterCIDR.Contains(serviceCIDR.IP.Mask(r.clusterCIDR.Mask)) && !serviceCIDR.Contains(r.clusterCIDR.IP.Mask(serviceCIDR.Mask)) {
return
}
if err := r.cidrs.occupy(serviceCIDR); err != nil {
glog.Errorf("Error filtering out service cidr %v: %v", serviceCIDR, err)
}
}
// Assigns CIDR to Node and sends an update to the API server.
func (r *rangeAllocator) updateCIDRAllocation(data nodeAndCIDR) error {
var err error
var node *v1.Node
defer r.removeNodeFromProcessing(data.nodeName)
for rep := 0; rep < podCIDRUpdateRetry; rep++ {
// TODO: change it to using PATCH instead of full Node updates.
node, err = r.client.Core().Nodes().Get(data.nodeName, metav1.GetOptions{})
if err != nil {
glog.Errorf("Failed while getting node %v to retry updating Node.Spec.PodCIDR: %v", data.nodeName, err)
continue
}
if node.Spec.PodCIDR != "" {
glog.Errorf("Node %v already has allocated CIDR %v. Releasing assigned one if different.", node.Name, node.Spec.PodCIDR)
if node.Spec.PodCIDR != data.cidr.String() {
if err := r.cidrs.release(data.cidr); err != nil {
glog.Errorf("Error when releasing CIDR %v", data.cidr.String())
}
}
return nil
}
node.Spec.PodCIDR = data.cidr.String()
if _, err := r.client.Core().Nodes().Update(node); err != nil {
glog.Errorf("Failed while updating Node.Spec.PodCIDR (%d retries left): %v", podCIDRUpdateRetry-rep-1, err)
} else {
break
}
}
if err != nil {
recordNodeStatusChange(r.recorder, node, "CIDRAssignmentFailed")
// We accept the fact that we may leek CIDRs here. This is safer than releasing
// them in case when we don't know if request went through.
// NodeController restart will return all falsely allocated CIDRs to the pool.
if !apierrors.IsServerTimeout(err) {
glog.Errorf("CIDR assignment for node %v failed: %v. Releasing allocated CIDR", data.nodeName, err)
if releaseErr := r.cidrs.release(data.cidr); releaseErr != nil {
glog.Errorf("Error releasing allocated CIDR for node %v: %v", data.nodeName, releaseErr)
}
}
}
return err
}

View File

@ -0,0 +1,143 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package node
import (
"fmt"
"sync"
"github.com/golang/glog"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
clientv1 "k8s.io/client-go/pkg/api/v1"
"k8s.io/client-go/tools/record"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/client/clientset_generated/clientset"
"k8s.io/kubernetes/pkg/cloudprovider"
"k8s.io/kubernetes/pkg/cloudprovider/providers/gce"
nodeutil "k8s.io/kubernetes/pkg/util/node"
)
// cloudCIDRAllocator allocates node CIDRs according to IP address aliases
// assigned by the cloud provider. In this case, the allocation and
// deallocation is delegated to the external provider, and the controller
// merely takes the assignment and updates the node spec.
type cloudCIDRAllocator struct {
lock sync.Mutex
client clientset.Interface
cloud *gce.GCECloud
recorder record.EventRecorder
}
var _ CIDRAllocator = (*cloudCIDRAllocator)(nil)
func NewCloudCIDRAllocator(
client clientset.Interface,
cloud cloudprovider.Interface) (ca CIDRAllocator, err error) {
gceCloud, ok := cloud.(*gce.GCECloud)
if !ok {
err = fmt.Errorf("cloudCIDRAllocator does not support %v provider", cloud.ProviderName())
return
}
ca = &cloudCIDRAllocator{
client: client,
cloud: gceCloud,
recorder: record.NewBroadcaster().NewRecorder(
api.Scheme,
clientv1.EventSource{Component: "cidrAllocator"}),
}
glog.V(0).Infof("Using cloud CIDR allocator (provider: %v)", cloud.ProviderName())
return
}
func (ca *cloudCIDRAllocator) AllocateOrOccupyCIDR(node *v1.Node) error {
glog.V(2).Infof("Updating PodCIDR for node %v", node.Name)
cidrs, err := ca.cloud.AliasRanges(types.NodeName(node.Name))
if err != nil {
recordNodeStatusChange(ca.recorder, node, "CIDRNotAvailable")
return fmt.Errorf("failed to allocate cidr: %v", err)
}
if len(cidrs) == 0 {
recordNodeStatusChange(ca.recorder, node, "CIDRNotAvailable")
glog.V(2).Infof("Node %v has no CIDRs", node.Name)
return fmt.Errorf("failed to allocate cidr (none exist)")
}
node, err = ca.client.Core().Nodes().Get(node.Name, metav1.GetOptions{})
if err != nil {
glog.Errorf("Could not get Node object from Kubernetes: %v", err)
return err
}
podCIDR := cidrs[0]
if node.Spec.PodCIDR != "" {
if node.Spec.PodCIDR == podCIDR {
glog.V(3).Infof("Node %v has PodCIDR %v", node.Name, podCIDR)
return nil
}
glog.Errorf("PodCIDR cannot be reassigned, node %v spec has %v, but cloud provider has assigned %v",
node.Name, node.Spec.PodCIDR, podCIDR)
// We fall through and set the CIDR despite this error. This
// implements the same logic as implemented in the
// rangeAllocator.
//
// See https://github.com/kubernetes/kubernetes/pull/42147#discussion_r103357248
}
node.Spec.PodCIDR = cidrs[0]
if _, err := ca.client.Core().Nodes().Update(node); err == nil {
glog.V(2).Infof("Node %v PodCIDR set to %v", node.Name, podCIDR)
} else {
glog.Errorf("Could not update node %v PodCIDR to %v: %v",
node.Name, podCIDR, err)
return err
}
err = nodeutil.SetNodeCondition(ca.client, types.NodeName(node.Name), v1.NodeCondition{
Type: v1.NodeNetworkUnavailable,
Status: v1.ConditionFalse,
Reason: "RouteCreated",
Message: "NodeController create implicit route",
LastTransitionTime: metav1.Now(),
})
if err != nil {
glog.Errorf("Error setting route status for node %v: %v",
node.Name, err)
}
return err
}
func (ca *cloudCIDRAllocator) ReleaseCIDR(node *v1.Node) error {
glog.V(2).Infof("Node %v PodCIDR (%v) will be released by external cloud provider (not managed by controller)",
node.Name, node.Spec.PodCIDR)
return nil
}

View File

@ -109,11 +109,13 @@ type nodeStatusData struct {
type NodeController struct {
allocateNodeCIDRs bool
cloud cloudprovider.Interface
clusterCIDR *net.IPNet
serviceCIDR *net.IPNet
knownNodeSet map[string]*v1.Node
kubeClient clientset.Interface
allocatorType CIDRAllocatorType
cloud cloudprovider.Interface
clusterCIDR *net.IPNet
serviceCIDR *net.IPNet
knownNodeSet map[string]*v1.Node
kubeClient clientset.Interface
// Method for easy mocking in unittest.
lookupIP func(host string) ([]net.IP, error)
// Value used if sync_nodes_status=False. NodeController will not proactively
@ -162,9 +164,8 @@ type NodeController struct {
podInformerSynced cache.InformerSynced
// allocate/recycle CIDRs for node if allocateNodeCIDRs == true
cidrAllocator CIDRAllocator
// manages taints
taintManager *NoExecuteTaintManager
forcefullyDeletePod func(*v1.Pod) error
@ -210,6 +211,7 @@ func NewNodeController(
serviceCIDR *net.IPNet,
nodeCIDRMaskSize int,
allocateNodeCIDRs bool,
allocatorType CIDRAllocatorType,
runTaintManager bool,
useTaintBasedEvictions bool) (*NodeController, error) {
eventBroadcaster := record.NewBroadcaster()
@ -254,6 +256,7 @@ func NewNodeController(
clusterCIDR: clusterCIDR,
serviceCIDR: serviceCIDR,
allocateNodeCIDRs: allocateNodeCIDRs,
allocatorType: allocatorType,
forcefullyDeletePod: func(p *v1.Pod) error { return forcefullyDeletePod(kubeClient, p) },
nodeExistsInCloudProvider: func(nodeName types.NodeName) (bool, error) { return nodeExistsInCloudProvider(cloud, nodeName) },
evictionLimiterQPS: evictionLimiterQPS,
@ -309,7 +312,6 @@ func NewNodeController(
})
nc.podInformerSynced = podInformer.Informer().HasSynced
nodeEventHandlerFuncs := cache.ResourceEventHandlerFuncs{}
if nc.allocateNodeCIDRs {
var nodeList *v1.NodeList
var err error
@ -328,147 +330,32 @@ func NewNodeController(
}); pollErr != nil {
return nil, fmt.Errorf("Failed to list all nodes in %v, cannot proceed without updating CIDR map", apiserverStartupGracePeriod)
}
nc.cidrAllocator, err = NewCIDRRangeAllocator(kubeClient, clusterCIDR, serviceCIDR, nodeCIDRMaskSize, nodeList)
switch nc.allocatorType {
case RangeAllocatorType:
nc.cidrAllocator, err = NewCIDRRangeAllocator(
kubeClient, clusterCIDR, serviceCIDR, nodeCIDRMaskSize, nodeList)
case CloudAllocatorType:
nc.cidrAllocator, err = NewCloudCIDRAllocator(kubeClient, cloud)
default:
return nil, fmt.Errorf("Invalid CIDR allocator type: %v", nc.allocatorType)
}
if err != nil {
return nil, err
}
nodeEventHandlerFuncs = cache.ResourceEventHandlerFuncs{
AddFunc: func(originalObj interface{}) {
obj, err := api.Scheme.DeepCopy(originalObj)
if err != nil {
utilruntime.HandleError(err)
return
}
node := obj.(*v1.Node)
if err := nc.cidrAllocator.AllocateOrOccupyCIDR(node); err != nil {
utilruntime.HandleError(fmt.Errorf("Error allocating CIDR: %v", err))
}
if nc.taintManager != nil {
nc.taintManager.NodeUpdated(nil, node)
}
},
UpdateFunc: func(oldNode, newNode interface{}) {
node := newNode.(*v1.Node)
prevNode := oldNode.(*v1.Node)
// If the PodCIDR is not empty we either:
// - already processed a Node that already had a CIDR after NC restarted
// (cidr is marked as used),
// - already processed a Node successfully and allocated a CIDR for it
// (cidr is marked as used),
// - already processed a Node but we did saw a "timeout" response and
// request eventually got through in this case we haven't released
// the allocated CIDR (cidr is still marked as used).
// There's a possible error here:
// - NC sees a new Node and assigns a CIDR X to it,
// - Update Node call fails with a timeout,
// - Node is updated by some other component, NC sees an update and
// assigns CIDR Y to the Node,
// - Both CIDR X and CIDR Y are marked as used in the local cache,
// even though Node sees only CIDR Y
// The problem here is that in in-memory cache we see CIDR X as marked,
// which prevents it from being assigned to any new node. The cluster
// state is correct.
// Restart of NC fixes the issue.
if node.Spec.PodCIDR == "" {
nodeCopy, err := api.Scheme.Copy(node)
if err != nil {
utilruntime.HandleError(err)
return
}
if err := nc.cidrAllocator.AllocateOrOccupyCIDR(nodeCopy.(*v1.Node)); err != nil {
utilruntime.HandleError(fmt.Errorf("Error allocating CIDR: %v", err))
}
}
if nc.taintManager != nil {
nc.taintManager.NodeUpdated(prevNode, node)
}
},
DeleteFunc: func(originalObj interface{}) {
obj, err := api.Scheme.DeepCopy(originalObj)
if err != nil {
utilruntime.HandleError(err)
return
}
node, isNode := obj.(*v1.Node)
// We can get DeletedFinalStateUnknown instead of *v1.Node here and we need to handle that correctly. #34692
if !isNode {
deletedState, ok := obj.(cache.DeletedFinalStateUnknown)
if !ok {
glog.Errorf("Received unexpected object: %v", obj)
return
}
node, ok = deletedState.Obj.(*v1.Node)
if !ok {
glog.Errorf("DeletedFinalStateUnknown contained non-Node object: %v", deletedState.Obj)
return
}
}
if nc.taintManager != nil {
nc.taintManager.NodeUpdated(node, nil)
}
if err := nc.cidrAllocator.ReleaseCIDR(node); err != nil {
glog.Errorf("Error releasing CIDR: %v", err)
}
},
}
} else {
nodeEventHandlerFuncs = cache.ResourceEventHandlerFuncs{
AddFunc: func(originalObj interface{}) {
obj, err := api.Scheme.DeepCopy(originalObj)
if err != nil {
utilruntime.HandleError(err)
return
}
node := obj.(*v1.Node)
if nc.taintManager != nil {
nc.taintManager.NodeUpdated(nil, node)
}
},
UpdateFunc: func(oldNode, newNode interface{}) {
node := newNode.(*v1.Node)
prevNode := oldNode.(*v1.Node)
if nc.taintManager != nil {
nc.taintManager.NodeUpdated(prevNode, node)
}
},
DeleteFunc: func(originalObj interface{}) {
obj, err := api.Scheme.DeepCopy(originalObj)
if err != nil {
utilruntime.HandleError(err)
return
}
node, isNode := obj.(*v1.Node)
// We can get DeletedFinalStateUnknown instead of *v1.Node here and we need to handle that correctly. #34692
if !isNode {
deletedState, ok := obj.(cache.DeletedFinalStateUnknown)
if !ok {
glog.Errorf("Received unexpected object: %v", obj)
return
}
node, ok = deletedState.Obj.(*v1.Node)
if !ok {
glog.Errorf("DeletedFinalStateUnknown contained non-Node object: %v", deletedState.Obj)
return
}
}
if nc.taintManager != nil {
nc.taintManager.NodeUpdated(node, nil)
}
},
}
nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: nc.onNodeAdd,
UpdateFunc: nc.onNodeUpdate,
DeleteFunc: nc.onNodeDelete,
})
}
if nc.runTaintManager {
nc.taintManager = NewNoExecuteTaintManager(kubeClient)
}
nodeInformer.Informer().AddEventHandler(nodeEventHandlerFuncs)
nc.nodeLister = nodeInformer.Lister()
nc.nodeInformerSynced = nodeInformer.Informer().HasSynced
@ -546,6 +433,90 @@ func (nc *NodeController) doTaintingPass() {
}
}
func (nc *NodeController) onNodeAdd(originalObj interface{}) {
obj, err := api.Scheme.DeepCopy(originalObj)
if err != nil {
utilruntime.HandleError(err)
return
}
node := obj.(*v1.Node)
if err := nc.cidrAllocator.AllocateOrOccupyCIDR(node); err != nil {
utilruntime.HandleError(fmt.Errorf("Error allocating CIDR: %v", err))
}
if nc.taintManager != nil {
nc.taintManager.NodeUpdated(nil, node)
}
}
func (nc *NodeController) onNodeUpdate(oldNode, newNode interface{}) {
node := newNode.(*v1.Node)
prevNode := oldNode.(*v1.Node)
// If the PodCIDR is not empty we either:
// - already processed a Node that already had a CIDR after NC restarted
// (cidr is marked as used),
// - already processed a Node successfully and allocated a CIDR for it
// (cidr is marked as used),
// - already processed a Node but we did saw a "timeout" response and
// request eventually got through in this case we haven't released
// the allocated CIDR (cidr is still marked as used).
// There's a possible error here:
// - NC sees a new Node and assigns a CIDR X to it,
// - Update Node call fails with a timeout,
// - Node is updated by some other component, NC sees an update and
// assigns CIDR Y to the Node,
// - Both CIDR X and CIDR Y are marked as used in the local cache,
// even though Node sees only CIDR Y
// The problem here is that in in-memory cache we see CIDR X as marked,
// which prevents it from being assigned to any new node. The cluster
// state is correct.
// Restart of NC fixes the issue.
if node.Spec.PodCIDR == "" {
nodeCopy, err := api.Scheme.Copy(node)
if err != nil {
utilruntime.HandleError(err)
return
}
if err := nc.cidrAllocator.AllocateOrOccupyCIDR(nodeCopy.(*v1.Node)); err != nil {
utilruntime.HandleError(fmt.Errorf("Error allocating CIDR: %v", err))
}
}
if nc.taintManager != nil {
nc.taintManager.NodeUpdated(prevNode, node)
}
}
func (nc *NodeController) onNodeDelete(originalObj interface{}) {
obj, err := api.Scheme.DeepCopy(originalObj)
if err != nil {
utilruntime.HandleError(err)
return
}
node, isNode := obj.(*v1.Node)
// We can get DeletedFinalStateUnknown instead of *v1.Node here and
// we need to handle that correctly. #34692
if !isNode {
deletedState, ok := obj.(cache.DeletedFinalStateUnknown)
if !ok {
glog.Errorf("Received unexpected object: %v", obj)
return
}
node, ok = deletedState.Obj.(*v1.Node)
if !ok {
glog.Errorf("DeletedFinalStateUnknown contained non-Node object: %v", deletedState.Obj)
return
}
}
if nc.taintManager != nil {
nc.taintManager.NodeUpdated(node, nil)
}
if err := nc.cidrAllocator.ReleaseCIDR(node); err != nil {
glog.Errorf("Error releasing CIDR: %v", err)
}
}
// Run starts an asynchronous loop that monitors the status of cluster nodes.
func (nc *NodeController) Run() {
go func() {

View File

@ -101,6 +101,7 @@ func NewNodeControllerFromClient(
serviceCIDR,
nodeCIDRMaskSize,
allocateNodeCIDRs,
RangeAllocatorType,
useTaints,
useTaints,
)
@ -549,9 +550,22 @@ func TestMonitorNodeStatusEvictPods(t *testing.T) {
}
for _, item := range table {
nodeController, _ := NewNodeControllerFromClient(nil, item.fakeNodeHandler,
evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, testUnhealtyThreshold, testNodeMonitorGracePeriod,
testNodeStartupGracePeriod, testNodeMonitorPeriod, nil, nil, 0, false, false)
nodeController, _ := NewNodeControllerFromClient(
nil,
item.fakeNodeHandler,
evictionTimeout,
testRateLimiterQPS,
testRateLimiterQPS,
testLargeClusterThreshold,
testUnhealtyThreshold,
testNodeMonitorGracePeriod,
testNodeStartupGracePeriod,
testNodeMonitorPeriod,
nil,
nil,
0,
false,
false)
nodeController.now = func() metav1.Time { return fakeNow }
nodeController.recorder = testutil.NewFakeRecorder()
for _, ds := range item.daemonSets {

View File

@ -0,0 +1,262 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package node
import (
"fmt"
"net"
"sync"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
v1core "k8s.io/client-go/kubernetes/typed/core/v1"
clientv1 "k8s.io/client-go/pkg/api/v1"
"k8s.io/client-go/tools/record"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/client/clientset_generated/clientset"
"github.com/golang/glog"
)
// TODO: figure out the good setting for those constants.
const (
// controls how many NodeSpec updates NC can process concurrently.
cidrUpdateWorkers = 10
cidrUpdateQueueSize = 5000
// podCIDRUpdateRetry controls the number of retries of writing Node.Spec.PodCIDR update.
podCIDRUpdateRetry = 5
)
type rangeAllocator struct {
client clientset.Interface
cidrs *cidrSet
clusterCIDR *net.IPNet
maxCIDRs int
// Channel that is used to pass updating Nodes with assigned CIDRs to the background
// This increases a throughput of CIDR assignment by not blocking on long operations.
nodeCIDRUpdateChannel chan nodeAndCIDR
recorder record.EventRecorder
// Keep a set of nodes that are currectly being processed to avoid races in CIDR allocation
sync.Mutex
nodesInProcessing sets.String
}
// NewCIDRRangeAllocator returns a CIDRAllocator to allocate CIDR for node
// Caller must ensure subNetMaskSize is not less than cluster CIDR mask size.
// Caller must always pass in a list of existing nodes so the new allocator
// can initialize its CIDR map. NodeList is only nil in testing.
func NewCIDRRangeAllocator(client clientset.Interface, clusterCIDR *net.IPNet, serviceCIDR *net.IPNet, subNetMaskSize int, nodeList *v1.NodeList) (CIDRAllocator, error) {
eventBroadcaster := record.NewBroadcaster()
recorder := eventBroadcaster.NewRecorder(api.Scheme, clientv1.EventSource{Component: "cidrAllocator"})
eventBroadcaster.StartLogging(glog.Infof)
if client != nil {
glog.V(0).Infof("Sending events to api server.")
eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: v1core.New(client.Core().RESTClient()).Events("")})
} else {
glog.Fatalf("kubeClient is nil when starting NodeController")
}
ra := &rangeAllocator{
client: client,
cidrs: newCIDRSet(clusterCIDR, subNetMaskSize),
clusterCIDR: clusterCIDR,
nodeCIDRUpdateChannel: make(chan nodeAndCIDR, cidrUpdateQueueSize),
recorder: recorder,
nodesInProcessing: sets.NewString(),
}
if serviceCIDR != nil {
ra.filterOutServiceRange(serviceCIDR)
} else {
glog.V(0).Info("No Service CIDR provided. Skipping filtering out service addresses.")
}
if nodeList != nil {
for _, node := range nodeList.Items {
if node.Spec.PodCIDR == "" {
glog.Infof("Node %v has no CIDR, ignoring", node.Name)
continue
} else {
glog.Infof("Node %v has CIDR %s, occupying it in CIDR map",
node.Name, node.Spec.PodCIDR)
}
if err := ra.occupyCIDR(&node); err != nil {
// This will happen if:
// 1. We find garbage in the podCIDR field. Retrying is useless.
// 2. CIDR out of range: This means a node CIDR has changed.
// This error will keep crashing controller-manager.
return nil, err
}
}
}
for i := 0; i < cidrUpdateWorkers; i++ {
go func(stopChan <-chan struct{}) {
for {
select {
case workItem, ok := <-ra.nodeCIDRUpdateChannel:
if !ok {
glog.Warning("NodeCIDRUpdateChannel read returned false.")
return
}
ra.updateCIDRAllocation(workItem)
case <-stopChan:
return
}
}
}(wait.NeverStop)
}
return ra, nil
}
func (r *rangeAllocator) insertNodeToProcessing(nodeName string) bool {
r.Lock()
defer r.Unlock()
if r.nodesInProcessing.Has(nodeName) {
return false
}
r.nodesInProcessing.Insert(nodeName)
return true
}
func (r *rangeAllocator) removeNodeFromProcessing(nodeName string) {
r.Lock()
defer r.Unlock()
r.nodesInProcessing.Delete(nodeName)
}
func (r *rangeAllocator) occupyCIDR(node *v1.Node) error {
defer r.removeNodeFromProcessing(node.Name)
if node.Spec.PodCIDR == "" {
return nil
}
_, podCIDR, err := net.ParseCIDR(node.Spec.PodCIDR)
if err != nil {
return fmt.Errorf("failed to parse node %s, CIDR %s", node.Name, node.Spec.PodCIDR)
}
if err := r.cidrs.occupy(podCIDR); err != nil {
return fmt.Errorf("failed to mark cidr as occupied: %v", err)
}
return nil
}
// WARNING: If you're adding any return calls or defer any more work from this
// function you have to handle correctly nodesInProcessing.
func (r *rangeAllocator) AllocateOrOccupyCIDR(node *v1.Node) error {
if node == nil {
return nil
}
if !r.insertNodeToProcessing(node.Name) {
glog.V(2).Infof("Node %v is already in a process of CIDR assignment.", node.Name)
return nil
}
if node.Spec.PodCIDR != "" {
return r.occupyCIDR(node)
}
podCIDR, err := r.cidrs.allocateNext()
if err != nil {
r.removeNodeFromProcessing(node.Name)
recordNodeStatusChange(r.recorder, node, "CIDRNotAvailable")
return fmt.Errorf("failed to allocate cidr: %v", err)
}
glog.V(10).Infof("Putting node %s with CIDR %s into the work queue", node.Name, podCIDR)
r.nodeCIDRUpdateChannel <- nodeAndCIDR{
nodeName: node.Name,
cidr: podCIDR,
}
return nil
}
func (r *rangeAllocator) ReleaseCIDR(node *v1.Node) error {
if node == nil || node.Spec.PodCIDR == "" {
return nil
}
_, podCIDR, err := net.ParseCIDR(node.Spec.PodCIDR)
if err != nil {
return fmt.Errorf("Failed to parse CIDR %s on Node %v: %v", node.Spec.PodCIDR, node.Name, err)
}
glog.V(4).Infof("release CIDR %s", node.Spec.PodCIDR)
if err = r.cidrs.release(podCIDR); err != nil {
return fmt.Errorf("Error when releasing CIDR %v: %v", node.Spec.PodCIDR, err)
}
return err
}
// Marks all CIDRs with subNetMaskSize that belongs to serviceCIDR as used,
// so that they won't be assignable.
func (r *rangeAllocator) filterOutServiceRange(serviceCIDR *net.IPNet) {
// Checks if service CIDR has a nonempty intersection with cluster
// CIDR. It is the case if either clusterCIDR contains serviceCIDR with
// clusterCIDR's Mask applied (this means that clusterCIDR contains
// serviceCIDR) or vice versa (which means that serviceCIDR contains
// clusterCIDR).
if !r.clusterCIDR.Contains(serviceCIDR.IP.Mask(r.clusterCIDR.Mask)) && !serviceCIDR.Contains(r.clusterCIDR.IP.Mask(serviceCIDR.Mask)) {
return
}
if err := r.cidrs.occupy(serviceCIDR); err != nil {
glog.Errorf("Error filtering out service cidr %v: %v", serviceCIDR, err)
}
}
// Assigns CIDR to Node and sends an update to the API server.
func (r *rangeAllocator) updateCIDRAllocation(data nodeAndCIDR) error {
var err error
var node *v1.Node
defer r.removeNodeFromProcessing(data.nodeName)
for rep := 0; rep < podCIDRUpdateRetry; rep++ {
// TODO: change it to using PATCH instead of full Node updates.
node, err = r.client.Core().Nodes().Get(data.nodeName, metav1.GetOptions{})
if err != nil {
glog.Errorf("Failed while getting node %v to retry updating Node.Spec.PodCIDR: %v", data.nodeName, err)
continue
}
if node.Spec.PodCIDR != "" {
glog.Errorf("Node %v already has allocated CIDR %v. Releasing assigned one if different.", node.Name, node.Spec.PodCIDR)
if node.Spec.PodCIDR != data.cidr.String() {
if err := r.cidrs.release(data.cidr); err != nil {
glog.Errorf("Error when releasing CIDR %v", data.cidr.String())
}
}
return nil
}
node.Spec.PodCIDR = data.cidr.String()
if _, err := r.client.Core().Nodes().Update(node); err != nil {
glog.Errorf("Failed while updating Node.Spec.PodCIDR (%d retries left): %v", podCIDRUpdateRetry-rep-1, err)
} else {
break
}
}
if err != nil {
recordNodeStatusChange(r.recorder, node, "CIDRAssignmentFailed")
// We accept the fact that we may leek CIDRs here. This is safer than releasing
// them in case when we don't know if request went through.
// NodeController restart will return all falsely allocated CIDRs to the pool.
if !apierrors.IsServerTimeout(err) {
glog.Errorf("CIDR assignment for node %v failed: %v. Releasing allocated CIDR", data.nodeName, err)
if releaseErr := r.cidrs.release(data.cidr); releaseErr != nil {
glog.Errorf("Error releasing allocated CIDR for node %v: %v", data.nodeName, releaseErr)
}
}
}
return err
}

View File

@ -168,7 +168,7 @@ func init() {
ObjectMeta: metav1.ObjectMeta{Name: saRolePrefix + "node-controller"},
Rules: []rbac.PolicyRule{
rbac.NewRule("get", "list", "update", "delete", "patch").Groups(legacyGroup).Resources("nodes").RuleOrDie(),
rbac.NewRule("update").Groups(legacyGroup).Resources("nodes/status").RuleOrDie(),
rbac.NewRule("patch", "update").Groups(legacyGroup).Resources("nodes/status").RuleOrDie(),
// used for pod eviction
rbac.NewRule("update").Groups(legacyGroup).Resources("pods/status").RuleOrDie(),
rbac.NewRule("list", "delete").Groups(legacyGroup).Resources("pods").RuleOrDie(),

View File

@ -561,6 +561,7 @@ items:
resources:
- nodes/status
verbs:
- patch
- update
- apiGroups:
- ""