mirror of https://github.com/k3s-io/k3s
Adds support for PodCIDR allocation from the GCE cloud provider
If CIDRAllocatorType is set to `CloudCIDRAllocator`, then CIDR allocation is instead done by the external cloud provider and the node controller is only responsible for reflecting the allocation into the node spec.

- Splits off the rangeAllocator from the cidr_allocator.go file.
- Adds cloudCIDRAllocator, which is used when the cloud provider allocates the CIDR ranges externally. (GCE support only)
- Updates RBAC permission for node controller to include PATCH
parent 345c65847f
commit f61590c221
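
Before the diff itself, a minimal, self-contained Go sketch of the selection logic this change introduces: the controller manager's new --cidr-allocator-type flag ("RangeAllocator" by default, "CloudAllocator" for the GCE-backed path) is mapped to a CIDRAllocator implementation. The stub types and the newCIDRAllocator helper here are illustrative only, not the actual Kubernetes sources; in the diff below the real constructors are NewCIDRRangeAllocator and NewCloudCIDRAllocator, chosen by the switch added to NewNodeController.

package main

import "fmt"

// CIDRAllocatorType mirrors the string type added in cidr_allocator.go.
type CIDRAllocatorType string

const (
	RangeAllocatorType CIDRAllocatorType = "RangeAllocator" // controller-managed ranges
	CloudAllocatorType CIDRAllocatorType = "CloudAllocator" // ranges assigned by the cloud provider (GCE)
)

// CIDRAllocator is the interface both allocators satisfy.
type CIDRAllocator interface {
	AllocateOrOccupyCIDR(nodeName string) error
	ReleaseCIDR(nodeName string) error
}

// rangeAllocator and cloudCIDRAllocator below are stand-ins for the real
// implementations in range_allocator.go and cloud_cidr_allocator.go.
type rangeAllocator struct{}

func (rangeAllocator) AllocateOrOccupyCIDR(string) error { return nil }
func (rangeAllocator) ReleaseCIDR(string) error          { return nil }

type cloudCIDRAllocator struct{}

func (cloudCIDRAllocator) AllocateOrOccupyCIDR(string) error { return nil }
func (cloudCIDRAllocator) ReleaseCIDR(string) error          { return nil }

// newCIDRAllocator mirrors the switch added to NewNodeController: an
// unrecognized allocator type is rejected up front.
func newCIDRAllocator(t CIDRAllocatorType) (CIDRAllocator, error) {
	switch t {
	case RangeAllocatorType:
		return rangeAllocator{}, nil
	case CloudAllocatorType:
		return cloudCIDRAllocator{}, nil
	default:
		return nil, fmt.Errorf("invalid CIDR allocator type: %v", t)
	}
}

func main() {
	alloc, err := newCIDRAllocator(CloudAllocatorType)
	if err != nil {
		panic(err)
	}
	fmt.Printf("using %T\n", alloc) // prints: using main.cloudCIDRAllocator
}
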
@@ -477,6 +477,7 @@ func StartControllers(controllers map[string]InitFunc, s *options.CMServer, root
serviceCIDR,
int(s.NodeCIDRMaskSize),
s.AllocateNodeCIDRs,
nodecontroller.CIDRAllocatorType(s.CIDRAllocatorType),
s.EnableTaintManager,
utilfeature.DefaultFeatureGate.Enabled(features.TaintBasedEvictions),
)
@@ -188,7 +188,10 @@ func (s *CMServer) AddFlags(fs *pflag.FlagSet, allControllers []string, disabled
fs.StringVar(&s.ClusterCIDR, "cluster-cidr", s.ClusterCIDR, "CIDR Range for Pods in cluster.")
fs.StringVar(&s.ServiceCIDR, "service-cluster-ip-range", s.ServiceCIDR, "CIDR Range for Services in cluster.")
fs.Int32Var(&s.NodeCIDRMaskSize, "node-cidr-mask-size", s.NodeCIDRMaskSize, "Mask size for node cidr in cluster.")
fs.BoolVar(&s.AllocateNodeCIDRs, "allocate-node-cidrs", false, "Should CIDRs for Pods be allocated and set on the cloud provider.")
fs.BoolVar(&s.AllocateNodeCIDRs, "allocate-node-cidrs", false,
"Should CIDRs for Pods be allocated and set on the cloud provider.")
fs.StringVar(&s.CIDRAllocatorType, "cidr-allocator-type", "RangeAllocator",
"Type of CIDR allocator to use")
fs.BoolVar(&s.ConfigureCloudRoutes, "configure-cloud-routes", true, "Should CIDRs allocated by allocate-node-cidrs be configured on the cloud provider.")
fs.StringVar(&s.Master, "master", s.Master, "The address of the Kubernetes API server (overrides any value in kubeconfig)")
fs.StringVar(&s.Kubeconfig, "kubeconfig", s.Kubeconfig, "Path to kubeconfig file with authorization and master location information.")
@@ -794,9 +794,11 @@ type KubeControllerManagerConfiguration struct {
ServiceCIDR string
// NodeCIDRMaskSize is the mask size for node cidr in cluster.
NodeCIDRMaskSize int32
// allocateNodeCIDRs enables CIDRs for Pods to be allocated and, if
// AllocateNodeCIDRs enables CIDRs for Pods to be allocated and, if
// ConfigureCloudRoutes is true, to be set on the cloud provider.
AllocateNodeCIDRs bool
// CIDRAllocatorType determines what kind of pod CIDR allocator will be used.
CIDRAllocatorType string
// configureCloudRoutes enables CIDRs allocated with allocateNodeCIDRs
// to be configured on the cloud provider.
ConfigureCloudRoutes bool
@@ -217,10 +217,10 @@ func (gce *GCECloud) CurrentNodeName(hostname string) (types.NodeName, error) {
return types.NodeName(hostname), nil
}

// PodCIDRs returns a list of CIDR ranges that are assigned to the
// AliasRanges returns a list of CIDR ranges that are assigned to the
// `node` for allocation to pods. Returns a list of the form
// "<ip>/<netmask>".
func (gce *GCECloud) PodCIDRs(nodeName types.NodeName) (cidrs []string, err error) {
func (gce *GCECloud) AliasRanges(nodeName types.NodeName) (cidrs []string, err error) {
var instance *gceInstance
instance, err = gce.getInstanceByName(mapNodeNameToInstanceName(nodeName))
if err != nil {
@@ -8,56 +8,6 @@ load(
"go_test",
)

go_library(
name = "go_default_library",
srcs = [
"cidr_allocator.go",
"cidr_set.go",
"controller_utils.go",
"doc.go",
"metrics.go",
"nodecontroller.go",
"rate_limited_queue.go",
"taint_controller.go",
"timed_workers.go",
],
tags = ["automanaged"],
deps = [
"//pkg/api:go_default_library",
"//pkg/api/v1:go_default_library",
"//pkg/client/clientset_generated/clientset:go_default_library",
"//pkg/client/informers/informers_generated/externalversions/core/v1:go_default_library",
"//pkg/client/informers/informers_generated/externalversions/extensions/v1beta1:go_default_library",
"//pkg/client/listers/core/v1:go_default_library",
"//pkg/client/listers/extensions/v1beta1:go_default_library",
"//pkg/cloudprovider:go_default_library",
"//pkg/controller:go_default_library",
"//pkg/kubelet/util/format:go_default_library",
"//pkg/util/metrics:go_default_library",
"//pkg/util/node:go_default_library",
"//pkg/util/system:go_default_library",
"//pkg/util/version:go_default_library",
"//vendor:github.com/golang/glog",
"//vendor:github.com/prometheus/client_golang/prometheus",
"//vendor:k8s.io/apimachinery/pkg/api/equality",
"//vendor:k8s.io/apimachinery/pkg/api/errors",
"//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
"//vendor:k8s.io/apimachinery/pkg/fields",
"//vendor:k8s.io/apimachinery/pkg/labels",
"//vendor:k8s.io/apimachinery/pkg/types",
"//vendor:k8s.io/apimachinery/pkg/util/errors",
"//vendor:k8s.io/apimachinery/pkg/util/runtime",
"//vendor:k8s.io/apimachinery/pkg/util/sets",
"//vendor:k8s.io/apimachinery/pkg/util/wait",
"//vendor:k8s.io/client-go/kubernetes/typed/core/v1",
"//vendor:k8s.io/client-go/pkg/api/v1",
"//vendor:k8s.io/client-go/tools/cache",
"//vendor:k8s.io/client-go/tools/record",
"//vendor:k8s.io/client-go/util/flowcontrol",
"//vendor:k8s.io/client-go/util/workqueue",
],
)

go_test(
name = "go_default_test",
srcs = [
@@ -96,6 +46,59 @@ go_test(
],
)

go_library(
name = "go_default_library",
srcs = [
"cidr_allocator.go",
"cidr_set.go",
"cloud_cidr_allocator.go",
"controller_utils.go",
"doc.go",
"metrics.go",
"nodecontroller.go",
"range_allocator.go",
"rate_limited_queue.go",
"taint_controller.go",
"timed_workers.go",
],
tags = ["automanaged"],
deps = [
"//pkg/api:go_default_library",
"//pkg/api/v1:go_default_library",
"//pkg/client/clientset_generated/clientset:go_default_library",
"//pkg/client/informers/informers_generated/externalversions/core/v1:go_default_library",
"//pkg/client/informers/informers_generated/externalversions/extensions/v1beta1:go_default_library",
"//pkg/client/listers/core/v1:go_default_library",
"//pkg/client/listers/extensions/v1beta1:go_default_library",
"//pkg/cloudprovider:go_default_library",
"//pkg/cloudprovider/providers/gce:go_default_library",
"//pkg/controller:go_default_library",
"//pkg/kubelet/util/format:go_default_library",
"//pkg/util/metrics:go_default_library",
"//pkg/util/node:go_default_library",
"//pkg/util/system:go_default_library",
"//pkg/util/version:go_default_library",
"//vendor:github.com/golang/glog",
"//vendor:github.com/prometheus/client_golang/prometheus",
"//vendor:k8s.io/apimachinery/pkg/api/equality",
"//vendor:k8s.io/apimachinery/pkg/api/errors",
"//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
"//vendor:k8s.io/apimachinery/pkg/fields",
"//vendor:k8s.io/apimachinery/pkg/labels",
"//vendor:k8s.io/apimachinery/pkg/types",
"//vendor:k8s.io/apimachinery/pkg/util/errors",
"//vendor:k8s.io/apimachinery/pkg/util/runtime",
"//vendor:k8s.io/apimachinery/pkg/util/sets",
"//vendor:k8s.io/apimachinery/pkg/util/wait",
"//vendor:k8s.io/client-go/kubernetes/typed/core/v1",
"//vendor:k8s.io/client-go/pkg/api/v1",
"//vendor:k8s.io/client-go/tools/cache",
"//vendor:k8s.io/client-go/tools/record",
"//vendor:k8s.io/client-go/util/flowcontrol",
"//vendor:k8s.io/client-go/util/workqueue",
],
)

filegroup(
name = "package-srcs",
srcs = glob(["**"]),
@@ -18,259 +18,34 @@ package node

import (
"errors"
"fmt"
"net"
"sync"

apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
v1core "k8s.io/client-go/kubernetes/typed/core/v1"
clientv1 "k8s.io/client-go/pkg/api/v1"
"k8s.io/client-go/tools/record"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/client/clientset_generated/clientset"

"github.com/golang/glog"
v1 "k8s.io/kubernetes/pkg/api/v1"
)

// TODO: figure out the good setting for those constants.
const (
// controls how many NodeSpec updates NC can process concurrently.
cidrUpdateWorkers = 10
cidrUpdateQueueSize = 5000
// podCIDRUpdateRetry controls the number of retries of writing Node.Spec.PodCIDR update.
podCIDRUpdateRetry = 5
)

var errCIDRRangeNoCIDRsRemaining = errors.New("CIDR allocation failed; there are no remaining CIDRs left to allocate in the accepted range")
var errCIDRRangeNoCIDRsRemaining = errors.New(
"CIDR allocation failed; there are no remaining CIDRs left to allocate in the accepted range")

type nodeAndCIDR struct {
cidr *net.IPNet
nodeName string
}

// CIDRAllocator is an interface implemented by things that know how to allocate/occupy/recycle CIDR for nodes.
// CIDRAllocatorType is the type of the allocator to use.
type CIDRAllocatorType string

const (
RangeAllocatorType CIDRAllocatorType = "RangeAllocator"
CloudAllocatorType CIDRAllocatorType = "CloudAllocator"
)

// CIDRAllocator is an interface implemented by things that know how to
// allocate/occupy/recycle CIDR for nodes.
type CIDRAllocator interface {
// AllocateOrOccupyCIDR looks at the given node, assigns it a valid
// CIDR if it doesn't currently have one or mark the CIDR as used if
// the node already have one.
AllocateOrOccupyCIDR(node *v1.Node) error
// ReleaseCIDR releases the CIDR of the removed node
ReleaseCIDR(node *v1.Node) error
}

type rangeAllocator struct {
client clientset.Interface
cidrs *cidrSet
clusterCIDR *net.IPNet
maxCIDRs int
// Channel that is used to pass updating Nodes with assigned CIDRs to the background
// This increases a throughput of CIDR assignment by not blocking on long operations.
nodeCIDRUpdateChannel chan nodeAndCIDR
recorder record.EventRecorder
// Keep a set of nodes that are currectly being processed to avoid races in CIDR allocation
sync.Mutex
nodesInProcessing sets.String
}

// NewCIDRRangeAllocator returns a CIDRAllocator to allocate CIDR for node
// Caller must ensure subNetMaskSize is not less than cluster CIDR mask size.
// Caller must always pass in a list of existing nodes so the new allocator
// can initialize its CIDR map. NodeList is only nil in testing.
func NewCIDRRangeAllocator(client clientset.Interface, clusterCIDR *net.IPNet, serviceCIDR *net.IPNet, subNetMaskSize int, nodeList *v1.NodeList) (CIDRAllocator, error) {
eventBroadcaster := record.NewBroadcaster()
recorder := eventBroadcaster.NewRecorder(api.Scheme, clientv1.EventSource{Component: "cidrAllocator"})
eventBroadcaster.StartLogging(glog.Infof)
if client != nil {
glog.V(0).Infof("Sending events to api server.")
eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: v1core.New(client.Core().RESTClient()).Events("")})
} else {
glog.Fatalf("kubeClient is nil when starting NodeController")
}

ra := &rangeAllocator{
client: client,
cidrs: newCIDRSet(clusterCIDR, subNetMaskSize),
clusterCIDR: clusterCIDR,
nodeCIDRUpdateChannel: make(chan nodeAndCIDR, cidrUpdateQueueSize),
recorder: recorder,
nodesInProcessing: sets.NewString(),
}

if serviceCIDR != nil {
ra.filterOutServiceRange(serviceCIDR)
} else {
glog.V(0).Info("No Service CIDR provided. Skipping filtering out service addresses.")
}

if nodeList != nil {
for _, node := range nodeList.Items {
if node.Spec.PodCIDR == "" {
glog.Infof("Node %v has no CIDR, ignoring", node.Name)
continue
} else {
glog.Infof("Node %v has CIDR %s, occupying it in CIDR map", node.Name, node.Spec.PodCIDR)
}
if err := ra.occupyCIDR(&node); err != nil {
// This will happen if:
// 1. We find garbage in the podCIDR field. Retrying is useless.
// 2. CIDR out of range: This means a node CIDR has changed.
// This error will keep crashing controller-manager.
return nil, err
}
}
}
for i := 0; i < cidrUpdateWorkers; i++ {
go func(stopChan <-chan struct{}) {
for {
select {
case workItem, ok := <-ra.nodeCIDRUpdateChannel:
if !ok {
glog.Warning("NodeCIDRUpdateChannel read returned false.")
return
}
ra.updateCIDRAllocation(workItem)
case <-stopChan:
return
}
}
}(wait.NeverStop)
}

return ra, nil
}

func (r *rangeAllocator) insertNodeToProcessing(nodeName string) bool {
r.Lock()
defer r.Unlock()
if r.nodesInProcessing.Has(nodeName) {
return false
}
r.nodesInProcessing.Insert(nodeName)
return true
}

func (r *rangeAllocator) removeNodeFromProcessing(nodeName string) {
r.Lock()
defer r.Unlock()
r.nodesInProcessing.Delete(nodeName)
}

func (r *rangeAllocator) occupyCIDR(node *v1.Node) error {
defer r.removeNodeFromProcessing(node.Name)
if node.Spec.PodCIDR == "" {
return nil
}
_, podCIDR, err := net.ParseCIDR(node.Spec.PodCIDR)
if err != nil {
return fmt.Errorf("failed to parse node %s, CIDR %s", node.Name, node.Spec.PodCIDR)
}
if err := r.cidrs.occupy(podCIDR); err != nil {
return fmt.Errorf("failed to mark cidr as occupied: %v", err)
}
return nil
}

// AllocateOrOccupyCIDR looks at the given node, assigns it a valid CIDR
// if it doesn't currently have one or mark the CIDR as used if the node already have one.
// WARNING: If you're adding any return calls or defer any more work from this function
// you have to handle correctly nodesInProcessing.
func (r *rangeAllocator) AllocateOrOccupyCIDR(node *v1.Node) error {
if node == nil {
return nil
}
if !r.insertNodeToProcessing(node.Name) {
glog.V(2).Infof("Node %v is already in a process of CIDR assignment.", node.Name)
return nil
}
if node.Spec.PodCIDR != "" {
return r.occupyCIDR(node)
}
podCIDR, err := r.cidrs.allocateNext()
if err != nil {
r.removeNodeFromProcessing(node.Name)
recordNodeStatusChange(r.recorder, node, "CIDRNotAvailable")
return fmt.Errorf("failed to allocate cidr: %v", err)
}

glog.V(10).Infof("Putting node %s with CIDR %s into the work queue", node.Name, podCIDR)
r.nodeCIDRUpdateChannel <- nodeAndCIDR{
nodeName: node.Name,
cidr: podCIDR,
}
return nil
}

// ReleaseCIDR releases the CIDR of the removed node
func (r *rangeAllocator) ReleaseCIDR(node *v1.Node) error {
if node == nil || node.Spec.PodCIDR == "" {
return nil
}
_, podCIDR, err := net.ParseCIDR(node.Spec.PodCIDR)
if err != nil {
return fmt.Errorf("Failed to parse CIDR %s on Node %v: %v", node.Spec.PodCIDR, node.Name, err)
}

glog.V(4).Infof("release CIDR %s", node.Spec.PodCIDR)
if err = r.cidrs.release(podCIDR); err != nil {
return fmt.Errorf("Error when releasing CIDR %v: %v", node.Spec.PodCIDR, err)
}
return err
}

// Marks all CIDRs with subNetMaskSize that belongs to serviceCIDR as used,
// so that they won't be assignable.
func (r *rangeAllocator) filterOutServiceRange(serviceCIDR *net.IPNet) {
// Checks if service CIDR has a nonempty intersection with cluster CIDR. It is the case if either
// clusterCIDR contains serviceCIDR with clusterCIDR's Mask applied (this means that clusterCIDR contains serviceCIDR)
// or vice versa (which means that serviceCIDR contains clusterCIDR).
if !r.clusterCIDR.Contains(serviceCIDR.IP.Mask(r.clusterCIDR.Mask)) && !serviceCIDR.Contains(r.clusterCIDR.IP.Mask(serviceCIDR.Mask)) {
return
}

if err := r.cidrs.occupy(serviceCIDR); err != nil {
glog.Errorf("Error filtering out service cidr %v: %v", serviceCIDR, err)
}
}

// Assigns CIDR to Node and sends an update to the API server.
func (r *rangeAllocator) updateCIDRAllocation(data nodeAndCIDR) error {
var err error
var node *v1.Node
defer r.removeNodeFromProcessing(data.nodeName)
for rep := 0; rep < podCIDRUpdateRetry; rep++ {
// TODO: change it to using PATCH instead of full Node updates.
node, err = r.client.Core().Nodes().Get(data.nodeName, metav1.GetOptions{})
if err != nil {
glog.Errorf("Failed while getting node %v to retry updating Node.Spec.PodCIDR: %v", data.nodeName, err)
continue
}
if node.Spec.PodCIDR != "" {
glog.Errorf("Node %v already has allocated CIDR %v. Releasing assigned one if different.", node.Name, node.Spec.PodCIDR)
if node.Spec.PodCIDR != data.cidr.String() {
if err := r.cidrs.release(data.cidr); err != nil {
glog.Errorf("Error when releasing CIDR %v", data.cidr.String())
}
}
return nil
}
node.Spec.PodCIDR = data.cidr.String()
if _, err := r.client.Core().Nodes().Update(node); err != nil {
glog.Errorf("Failed while updating Node.Spec.PodCIDR (%d retries left): %v", podCIDRUpdateRetry-rep-1, err)
} else {
break
}
}
if err != nil {
recordNodeStatusChange(r.recorder, node, "CIDRAssignmentFailed")
// We accept the fact that we may leek CIDRs here. This is safer than releasing
// them in case when we don't know if request went through.
// NodeController restart will return all falsely allocated CIDRs to the pool.
if !apierrors.IsServerTimeout(err) {
glog.Errorf("CIDR assignment for node %v failed: %v. Releasing allocated CIDR", data.nodeName, err)
if releaseErr := r.cidrs.release(data.cidr); releaseErr != nil {
glog.Errorf("Error releasing allocated CIDR for node %v: %v", data.nodeName, releaseErr)
}
}
}
return err
}
@@ -0,0 +1,143 @@
/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package node

import (
"fmt"
"sync"

"github.com/golang/glog"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"

clientv1 "k8s.io/client-go/pkg/api/v1"
"k8s.io/client-go/tools/record"

"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/client/clientset_generated/clientset"
"k8s.io/kubernetes/pkg/cloudprovider"
"k8s.io/kubernetes/pkg/cloudprovider/providers/gce"
nodeutil "k8s.io/kubernetes/pkg/util/node"
)

// cloudCIDRAllocator allocates node CIDRs according to IP address aliases
// assigned by the cloud provider. In this case, the allocation and
// deallocation is delegated to the external provider, and the controller
// merely takes the assignment and updates the node spec.
type cloudCIDRAllocator struct {
lock sync.Mutex

client clientset.Interface
cloud *gce.GCECloud

recorder record.EventRecorder
}

var _ CIDRAllocator = (*cloudCIDRAllocator)(nil)

func NewCloudCIDRAllocator(
client clientset.Interface,
cloud cloudprovider.Interface) (ca CIDRAllocator, err error) {

gceCloud, ok := cloud.(*gce.GCECloud)
if !ok {
err = fmt.Errorf("cloudCIDRAllocator does not support %v provider", cloud.ProviderName())
return
}

ca = &cloudCIDRAllocator{
client: client,
cloud: gceCloud,
recorder: record.NewBroadcaster().NewRecorder(
api.Scheme,
clientv1.EventSource{Component: "cidrAllocator"}),
}

glog.V(0).Infof("Using cloud CIDR allocator (provider: %v)", cloud.ProviderName())

return
}

func (ca *cloudCIDRAllocator) AllocateOrOccupyCIDR(node *v1.Node) error {
glog.V(2).Infof("Updating PodCIDR for node %v", node.Name)

cidrs, err := ca.cloud.AliasRanges(types.NodeName(node.Name))

if err != nil {
recordNodeStatusChange(ca.recorder, node, "CIDRNotAvailable")
return fmt.Errorf("failed to allocate cidr: %v", err)
}

if len(cidrs) == 0 {
recordNodeStatusChange(ca.recorder, node, "CIDRNotAvailable")
glog.V(2).Infof("Node %v has no CIDRs", node.Name)
return fmt.Errorf("failed to allocate cidr (none exist)")
}

node, err = ca.client.Core().Nodes().Get(node.Name, metav1.GetOptions{})
if err != nil {
glog.Errorf("Could not get Node object from Kubernetes: %v", err)
return err
}

podCIDR := cidrs[0]

if node.Spec.PodCIDR != "" {
if node.Spec.PodCIDR == podCIDR {
glog.V(3).Infof("Node %v has PodCIDR %v", node.Name, podCIDR)
return nil
}
glog.Errorf("PodCIDR cannot be reassigned, node %v spec has %v, but cloud provider has assigned %v",
node.Name, node.Spec.PodCIDR, podCIDR)
// We fall through and set the CIDR despite this error. This
// implements the same logic as implemented in the
// rangeAllocator.
//
// See https://github.com/kubernetes/kubernetes/pull/42147#discussion_r103357248
}

node.Spec.PodCIDR = cidrs[0]
if _, err := ca.client.Core().Nodes().Update(node); err == nil {
glog.V(2).Infof("Node %v PodCIDR set to %v", node.Name, podCIDR)
} else {
glog.Errorf("Could not update node %v PodCIDR to %v: %v",
node.Name, podCIDR, err)
return err
}

err = nodeutil.SetNodeCondition(ca.client, types.NodeName(node.Name), v1.NodeCondition{
Type: v1.NodeNetworkUnavailable,
Status: v1.ConditionFalse,
Reason: "RouteCreated",
Message: "NodeController create implicit route",
LastTransitionTime: metav1.Now(),
})
if err != nil {
glog.Errorf("Error setting route status for node %v: %v",
node.Name, err)
}

return err
}

func (ca *cloudCIDRAllocator) ReleaseCIDR(node *v1.Node) error {
glog.V(2).Infof("Node %v PodCIDR (%v) will be released by external cloud provider (not managed by controller)",
node.Name, node.Spec.PodCIDR)
return nil
}
@@ -109,11 +109,13 @@ type nodeStatusData struct {

type NodeController struct {
allocateNodeCIDRs bool
cloud cloudprovider.Interface
clusterCIDR *net.IPNet
serviceCIDR *net.IPNet
knownNodeSet map[string]*v1.Node
kubeClient clientset.Interface
allocatorType CIDRAllocatorType

cloud cloudprovider.Interface
clusterCIDR *net.IPNet
serviceCIDR *net.IPNet
knownNodeSet map[string]*v1.Node
kubeClient clientset.Interface
// Method for easy mocking in unittest.
lookupIP func(host string) ([]net.IP, error)
// Value used if sync_nodes_status=False. NodeController will not proactively
@@ -162,9 +164,8 @@ type NodeController struct {

podInformerSynced cache.InformerSynced

// allocate/recycle CIDRs for node if allocateNodeCIDRs == true
cidrAllocator CIDRAllocator
// manages taints

taintManager *NoExecuteTaintManager

forcefullyDeletePod func(*v1.Pod) error
@@ -210,6 +211,7 @@ func NewNodeController(
serviceCIDR *net.IPNet,
nodeCIDRMaskSize int,
allocateNodeCIDRs bool,
allocatorType CIDRAllocatorType,
runTaintManager bool,
useTaintBasedEvictions bool) (*NodeController, error) {
eventBroadcaster := record.NewBroadcaster()
@@ -254,6 +256,7 @@ func NewNodeController(
clusterCIDR: clusterCIDR,
serviceCIDR: serviceCIDR,
allocateNodeCIDRs: allocateNodeCIDRs,
allocatorType: allocatorType,
forcefullyDeletePod: func(p *v1.Pod) error { return forcefullyDeletePod(kubeClient, p) },
nodeExistsInCloudProvider: func(nodeName types.NodeName) (bool, error) { return nodeExistsInCloudProvider(cloud, nodeName) },
evictionLimiterQPS: evictionLimiterQPS,
@@ -309,7 +312,6 @@ func NewNodeController(
})
nc.podInformerSynced = podInformer.Informer().HasSynced

nodeEventHandlerFuncs := cache.ResourceEventHandlerFuncs{}
if nc.allocateNodeCIDRs {
var nodeList *v1.NodeList
var err error
@@ -328,147 +330,32 @@ func NewNodeController(
}); pollErr != nil {
return nil, fmt.Errorf("Failed to list all nodes in %v, cannot proceed without updating CIDR map", apiserverStartupGracePeriod)
}
nc.cidrAllocator, err = NewCIDRRangeAllocator(kubeClient, clusterCIDR, serviceCIDR, nodeCIDRMaskSize, nodeList)

switch nc.allocatorType {
case RangeAllocatorType:
nc.cidrAllocator, err = NewCIDRRangeAllocator(
kubeClient, clusterCIDR, serviceCIDR, nodeCIDRMaskSize, nodeList)
case CloudAllocatorType:
nc.cidrAllocator, err = NewCloudCIDRAllocator(kubeClient, cloud)
default:
return nil, fmt.Errorf("Invalid CIDR allocator type: %v", nc.allocatorType)
}

if err != nil {
return nil, err
}

nodeEventHandlerFuncs = cache.ResourceEventHandlerFuncs{
AddFunc: func(originalObj interface{}) {
obj, err := api.Scheme.DeepCopy(originalObj)
if err != nil {
utilruntime.HandleError(err)
return
}
node := obj.(*v1.Node)

if err := nc.cidrAllocator.AllocateOrOccupyCIDR(node); err != nil {
utilruntime.HandleError(fmt.Errorf("Error allocating CIDR: %v", err))
}
if nc.taintManager != nil {
nc.taintManager.NodeUpdated(nil, node)
}
},
UpdateFunc: func(oldNode, newNode interface{}) {
node := newNode.(*v1.Node)
prevNode := oldNode.(*v1.Node)
// If the PodCIDR is not empty we either:
// - already processed a Node that already had a CIDR after NC restarted
// (cidr is marked as used),
// - already processed a Node successfully and allocated a CIDR for it
// (cidr is marked as used),
// - already processed a Node but we did saw a "timeout" response and
// request eventually got through in this case we haven't released
// the allocated CIDR (cidr is still marked as used).
// There's a possible error here:
// - NC sees a new Node and assigns a CIDR X to it,
// - Update Node call fails with a timeout,
// - Node is updated by some other component, NC sees an update and
// assigns CIDR Y to the Node,
// - Both CIDR X and CIDR Y are marked as used in the local cache,
// even though Node sees only CIDR Y
// The problem here is that in in-memory cache we see CIDR X as marked,
// which prevents it from being assigned to any new node. The cluster
// state is correct.
// Restart of NC fixes the issue.
if node.Spec.PodCIDR == "" {
nodeCopy, err := api.Scheme.Copy(node)
if err != nil {
utilruntime.HandleError(err)
return
}

if err := nc.cidrAllocator.AllocateOrOccupyCIDR(nodeCopy.(*v1.Node)); err != nil {
utilruntime.HandleError(fmt.Errorf("Error allocating CIDR: %v", err))
}
}
if nc.taintManager != nil {
nc.taintManager.NodeUpdated(prevNode, node)
}
},
DeleteFunc: func(originalObj interface{}) {
obj, err := api.Scheme.DeepCopy(originalObj)
if err != nil {
utilruntime.HandleError(err)
return
}

node, isNode := obj.(*v1.Node)
// We can get DeletedFinalStateUnknown instead of *v1.Node here and we need to handle that correctly. #34692
if !isNode {
deletedState, ok := obj.(cache.DeletedFinalStateUnknown)
if !ok {
glog.Errorf("Received unexpected object: %v", obj)
return
}
node, ok = deletedState.Obj.(*v1.Node)
if !ok {
glog.Errorf("DeletedFinalStateUnknown contained non-Node object: %v", deletedState.Obj)
return
}
}
if nc.taintManager != nil {
nc.taintManager.NodeUpdated(node, nil)
}
if err := nc.cidrAllocator.ReleaseCIDR(node); err != nil {
glog.Errorf("Error releasing CIDR: %v", err)
}
},
}
} else {
nodeEventHandlerFuncs = cache.ResourceEventHandlerFuncs{
AddFunc: func(originalObj interface{}) {
obj, err := api.Scheme.DeepCopy(originalObj)
if err != nil {
utilruntime.HandleError(err)
return
}
node := obj.(*v1.Node)
if nc.taintManager != nil {
nc.taintManager.NodeUpdated(nil, node)
}
},
UpdateFunc: func(oldNode, newNode interface{}) {
node := newNode.(*v1.Node)
prevNode := oldNode.(*v1.Node)
if nc.taintManager != nil {
nc.taintManager.NodeUpdated(prevNode, node)

}
},
DeleteFunc: func(originalObj interface{}) {
obj, err := api.Scheme.DeepCopy(originalObj)
if err != nil {
utilruntime.HandleError(err)
return
}

node, isNode := obj.(*v1.Node)
// We can get DeletedFinalStateUnknown instead of *v1.Node here and we need to handle that correctly. #34692
if !isNode {
deletedState, ok := obj.(cache.DeletedFinalStateUnknown)
if !ok {
glog.Errorf("Received unexpected object: %v", obj)
return
}
node, ok = deletedState.Obj.(*v1.Node)
if !ok {
glog.Errorf("DeletedFinalStateUnknown contained non-Node object: %v", deletedState.Obj)
return
}
}
if nc.taintManager != nil {
nc.taintManager.NodeUpdated(node, nil)
}
},
}
nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: nc.onNodeAdd,
UpdateFunc: nc.onNodeUpdate,
DeleteFunc: nc.onNodeDelete,
})
}

if nc.runTaintManager {
nc.taintManager = NewNoExecuteTaintManager(kubeClient)
}

nodeInformer.Informer().AddEventHandler(nodeEventHandlerFuncs)
nc.nodeLister = nodeInformer.Lister()
nc.nodeInformerSynced = nodeInformer.Informer().HasSynced
@@ -546,6 +433,90 @@ func (nc *NodeController) doTaintingPass() {
}
}

func (nc *NodeController) onNodeAdd(originalObj interface{}) {
obj, err := api.Scheme.DeepCopy(originalObj)
if err != nil {
utilruntime.HandleError(err)
return
}
node := obj.(*v1.Node)

if err := nc.cidrAllocator.AllocateOrOccupyCIDR(node); err != nil {
utilruntime.HandleError(fmt.Errorf("Error allocating CIDR: %v", err))
}
if nc.taintManager != nil {
nc.taintManager.NodeUpdated(nil, node)
}
}

func (nc *NodeController) onNodeUpdate(oldNode, newNode interface{}) {
node := newNode.(*v1.Node)
prevNode := oldNode.(*v1.Node)
// If the PodCIDR is not empty we either:
// - already processed a Node that already had a CIDR after NC restarted
// (cidr is marked as used),
// - already processed a Node successfully and allocated a CIDR for it
// (cidr is marked as used),
// - already processed a Node but we did saw a "timeout" response and
// request eventually got through in this case we haven't released
// the allocated CIDR (cidr is still marked as used).
// There's a possible error here:
// - NC sees a new Node and assigns a CIDR X to it,
// - Update Node call fails with a timeout,
// - Node is updated by some other component, NC sees an update and
// assigns CIDR Y to the Node,
// - Both CIDR X and CIDR Y are marked as used in the local cache,
// even though Node sees only CIDR Y
// The problem here is that in in-memory cache we see CIDR X as marked,
// which prevents it from being assigned to any new node. The cluster
// state is correct.
// Restart of NC fixes the issue.
if node.Spec.PodCIDR == "" {
nodeCopy, err := api.Scheme.Copy(node)
if err != nil {
utilruntime.HandleError(err)
return
}

if err := nc.cidrAllocator.AllocateOrOccupyCIDR(nodeCopy.(*v1.Node)); err != nil {
utilruntime.HandleError(fmt.Errorf("Error allocating CIDR: %v", err))
}
}
if nc.taintManager != nil {
nc.taintManager.NodeUpdated(prevNode, node)
}
}

func (nc *NodeController) onNodeDelete(originalObj interface{}) {
obj, err := api.Scheme.DeepCopy(originalObj)
if err != nil {
utilruntime.HandleError(err)
return
}

node, isNode := obj.(*v1.Node)
// We can get DeletedFinalStateUnknown instead of *v1.Node here and
// we need to handle that correctly. #34692
if !isNode {
deletedState, ok := obj.(cache.DeletedFinalStateUnknown)
if !ok {
glog.Errorf("Received unexpected object: %v", obj)
return
}
node, ok = deletedState.Obj.(*v1.Node)
if !ok {
glog.Errorf("DeletedFinalStateUnknown contained non-Node object: %v", deletedState.Obj)
return
}
}
if nc.taintManager != nil {
nc.taintManager.NodeUpdated(node, nil)
}
if err := nc.cidrAllocator.ReleaseCIDR(node); err != nil {
glog.Errorf("Error releasing CIDR: %v", err)
}
}

// Run starts an asynchronous loop that monitors the status of cluster nodes.
func (nc *NodeController) Run() {
go func() {
@@ -101,6 +101,7 @@ func NewNodeControllerFromClient(
serviceCIDR,
nodeCIDRMaskSize,
allocateNodeCIDRs,
RangeAllocatorType,
useTaints,
useTaints,
)
@@ -549,9 +550,22 @@ func TestMonitorNodeStatusEvictPods(t *testing.T) {
}

for _, item := range table {
nodeController, _ := NewNodeControllerFromClient(nil, item.fakeNodeHandler,
evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, testUnhealtyThreshold, testNodeMonitorGracePeriod,
testNodeStartupGracePeriod, testNodeMonitorPeriod, nil, nil, 0, false, false)
nodeController, _ := NewNodeControllerFromClient(
nil,
item.fakeNodeHandler,
evictionTimeout,
testRateLimiterQPS,
testRateLimiterQPS,
testLargeClusterThreshold,
testUnhealtyThreshold,
testNodeMonitorGracePeriod,
testNodeStartupGracePeriod,
testNodeMonitorPeriod,
nil,
nil,
0,
false,
false)
nodeController.now = func() metav1.Time { return fakeNow }
nodeController.recorder = testutil.NewFakeRecorder()
for _, ds := range item.daemonSets {
@@ -0,0 +1,262 @@
/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package node

import (
"fmt"
"net"
"sync"

apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
v1core "k8s.io/client-go/kubernetes/typed/core/v1"
clientv1 "k8s.io/client-go/pkg/api/v1"
"k8s.io/client-go/tools/record"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/client/clientset_generated/clientset"

"github.com/golang/glog"
)

// TODO: figure out the good setting for those constants.
const (
// controls how many NodeSpec updates NC can process concurrently.
cidrUpdateWorkers = 10
cidrUpdateQueueSize = 5000
// podCIDRUpdateRetry controls the number of retries of writing Node.Spec.PodCIDR update.
podCIDRUpdateRetry = 5
)

type rangeAllocator struct {
client clientset.Interface
cidrs *cidrSet
clusterCIDR *net.IPNet
maxCIDRs int
// Channel that is used to pass updating Nodes with assigned CIDRs to the background
// This increases a throughput of CIDR assignment by not blocking on long operations.
nodeCIDRUpdateChannel chan nodeAndCIDR
recorder record.EventRecorder
// Keep a set of nodes that are currectly being processed to avoid races in CIDR allocation
sync.Mutex
nodesInProcessing sets.String
}

// NewCIDRRangeAllocator returns a CIDRAllocator to allocate CIDR for node
// Caller must ensure subNetMaskSize is not less than cluster CIDR mask size.
// Caller must always pass in a list of existing nodes so the new allocator
// can initialize its CIDR map. NodeList is only nil in testing.
func NewCIDRRangeAllocator(client clientset.Interface, clusterCIDR *net.IPNet, serviceCIDR *net.IPNet, subNetMaskSize int, nodeList *v1.NodeList) (CIDRAllocator, error) {
eventBroadcaster := record.NewBroadcaster()
recorder := eventBroadcaster.NewRecorder(api.Scheme, clientv1.EventSource{Component: "cidrAllocator"})
eventBroadcaster.StartLogging(glog.Infof)
if client != nil {
glog.V(0).Infof("Sending events to api server.")
eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: v1core.New(client.Core().RESTClient()).Events("")})
} else {
glog.Fatalf("kubeClient is nil when starting NodeController")
}

ra := &rangeAllocator{
client: client,
cidrs: newCIDRSet(clusterCIDR, subNetMaskSize),
clusterCIDR: clusterCIDR,
nodeCIDRUpdateChannel: make(chan nodeAndCIDR, cidrUpdateQueueSize),
recorder: recorder,
nodesInProcessing: sets.NewString(),
}

if serviceCIDR != nil {
ra.filterOutServiceRange(serviceCIDR)
} else {
glog.V(0).Info("No Service CIDR provided. Skipping filtering out service addresses.")
}

if nodeList != nil {
for _, node := range nodeList.Items {
if node.Spec.PodCIDR == "" {
glog.Infof("Node %v has no CIDR, ignoring", node.Name)
continue
} else {
glog.Infof("Node %v has CIDR %s, occupying it in CIDR map",
node.Name, node.Spec.PodCIDR)
}
if err := ra.occupyCIDR(&node); err != nil {
// This will happen if:
// 1. We find garbage in the podCIDR field. Retrying is useless.
// 2. CIDR out of range: This means a node CIDR has changed.
// This error will keep crashing controller-manager.
return nil, err
}
}
}
for i := 0; i < cidrUpdateWorkers; i++ {
go func(stopChan <-chan struct{}) {
for {
select {
case workItem, ok := <-ra.nodeCIDRUpdateChannel:
if !ok {
glog.Warning("NodeCIDRUpdateChannel read returned false.")
return
}
ra.updateCIDRAllocation(workItem)
case <-stopChan:
return
}
}
}(wait.NeverStop)
}

return ra, nil
}

func (r *rangeAllocator) insertNodeToProcessing(nodeName string) bool {
r.Lock()
defer r.Unlock()
if r.nodesInProcessing.Has(nodeName) {
return false
}
r.nodesInProcessing.Insert(nodeName)
return true
}

func (r *rangeAllocator) removeNodeFromProcessing(nodeName string) {
r.Lock()
defer r.Unlock()
r.nodesInProcessing.Delete(nodeName)
}

func (r *rangeAllocator) occupyCIDR(node *v1.Node) error {
defer r.removeNodeFromProcessing(node.Name)
if node.Spec.PodCIDR == "" {
return nil
}
_, podCIDR, err := net.ParseCIDR(node.Spec.PodCIDR)
if err != nil {
return fmt.Errorf("failed to parse node %s, CIDR %s", node.Name, node.Spec.PodCIDR)
}
if err := r.cidrs.occupy(podCIDR); err != nil {
return fmt.Errorf("failed to mark cidr as occupied: %v", err)
}
return nil
}

// WARNING: If you're adding any return calls or defer any more work from this
// function you have to handle correctly nodesInProcessing.
func (r *rangeAllocator) AllocateOrOccupyCIDR(node *v1.Node) error {
if node == nil {
return nil
}
if !r.insertNodeToProcessing(node.Name) {
glog.V(2).Infof("Node %v is already in a process of CIDR assignment.", node.Name)
return nil
}
if node.Spec.PodCIDR != "" {
return r.occupyCIDR(node)
}
podCIDR, err := r.cidrs.allocateNext()
if err != nil {
r.removeNodeFromProcessing(node.Name)
recordNodeStatusChange(r.recorder, node, "CIDRNotAvailable")
return fmt.Errorf("failed to allocate cidr: %v", err)
}

glog.V(10).Infof("Putting node %s with CIDR %s into the work queue", node.Name, podCIDR)
r.nodeCIDRUpdateChannel <- nodeAndCIDR{
nodeName: node.Name,
cidr: podCIDR,
}
return nil
}

func (r *rangeAllocator) ReleaseCIDR(node *v1.Node) error {
if node == nil || node.Spec.PodCIDR == "" {
return nil
}
_, podCIDR, err := net.ParseCIDR(node.Spec.PodCIDR)
if err != nil {
return fmt.Errorf("Failed to parse CIDR %s on Node %v: %v", node.Spec.PodCIDR, node.Name, err)
}

glog.V(4).Infof("release CIDR %s", node.Spec.PodCIDR)
if err = r.cidrs.release(podCIDR); err != nil {
return fmt.Errorf("Error when releasing CIDR %v: %v", node.Spec.PodCIDR, err)
}
return err
}

// Marks all CIDRs with subNetMaskSize that belongs to serviceCIDR as used,
// so that they won't be assignable.
func (r *rangeAllocator) filterOutServiceRange(serviceCIDR *net.IPNet) {
// Checks if service CIDR has a nonempty intersection with cluster
// CIDR. It is the case if either clusterCIDR contains serviceCIDR with
// clusterCIDR's Mask applied (this means that clusterCIDR contains
// serviceCIDR) or vice versa (which means that serviceCIDR contains
// clusterCIDR).
if !r.clusterCIDR.Contains(serviceCIDR.IP.Mask(r.clusterCIDR.Mask)) && !serviceCIDR.Contains(r.clusterCIDR.IP.Mask(serviceCIDR.Mask)) {
return
}

if err := r.cidrs.occupy(serviceCIDR); err != nil {
glog.Errorf("Error filtering out service cidr %v: %v", serviceCIDR, err)
}
}

// Assigns CIDR to Node and sends an update to the API server.
func (r *rangeAllocator) updateCIDRAllocation(data nodeAndCIDR) error {
var err error
var node *v1.Node
defer r.removeNodeFromProcessing(data.nodeName)
for rep := 0; rep < podCIDRUpdateRetry; rep++ {
// TODO: change it to using PATCH instead of full Node updates.
node, err = r.client.Core().Nodes().Get(data.nodeName, metav1.GetOptions{})
if err != nil {
glog.Errorf("Failed while getting node %v to retry updating Node.Spec.PodCIDR: %v", data.nodeName, err)
continue
}
if node.Spec.PodCIDR != "" {
glog.Errorf("Node %v already has allocated CIDR %v. Releasing assigned one if different.", node.Name, node.Spec.PodCIDR)
if node.Spec.PodCIDR != data.cidr.String() {
if err := r.cidrs.release(data.cidr); err != nil {
glog.Errorf("Error when releasing CIDR %v", data.cidr.String())
}
}
return nil
}
node.Spec.PodCIDR = data.cidr.String()
if _, err := r.client.Core().Nodes().Update(node); err != nil {
glog.Errorf("Failed while updating Node.Spec.PodCIDR (%d retries left): %v", podCIDRUpdateRetry-rep-1, err)
} else {
break
}
}
if err != nil {
recordNodeStatusChange(r.recorder, node, "CIDRAssignmentFailed")
// We accept the fact that we may leek CIDRs here. This is safer than releasing
// them in case when we don't know if request went through.
// NodeController restart will return all falsely allocated CIDRs to the pool.
if !apierrors.IsServerTimeout(err) {
glog.Errorf("CIDR assignment for node %v failed: %v. Releasing allocated CIDR", data.nodeName, err)
if releaseErr := r.cidrs.release(data.cidr); releaseErr != nil {
glog.Errorf("Error releasing allocated CIDR for node %v: %v", data.nodeName, releaseErr)
}
}
}
return err
}
@@ -168,7 +168,7 @@ func init() {
ObjectMeta: metav1.ObjectMeta{Name: saRolePrefix + "node-controller"},
Rules: []rbac.PolicyRule{
rbac.NewRule("get", "list", "update", "delete", "patch").Groups(legacyGroup).Resources("nodes").RuleOrDie(),
rbac.NewRule("update").Groups(legacyGroup).Resources("nodes/status").RuleOrDie(),
rbac.NewRule("patch", "update").Groups(legacyGroup).Resources("nodes/status").RuleOrDie(),
// used for pod eviction
rbac.NewRule("update").Groups(legacyGroup).Resources("pods/status").RuleOrDie(),
rbac.NewRule("list", "delete").Groups(legacyGroup).Resources("pods").RuleOrDie(),
@@ -561,6 +561,7 @@ items:
resources:
- nodes/status
verbs:
- patch
- update
- apiGroups:
- ""