diff --git a/pkg/controller/nodeipam/ipam/cidr_allocator.go b/pkg/controller/nodeipam/ipam/cidr_allocator.go
index b9a97938ad..4a5cee34d8 100644
--- a/pkg/controller/nodeipam/ipam/cidr_allocator.go
+++ b/pkg/controller/nodeipam/ipam/cidr_allocator.go
@@ -68,7 +68,7 @@ const (
 	cidrUpdateQueueSize = 5000
 
 	// cidrUpdateRetries is the no. of times a NodeSpec update will be retried before dropping it.
-	cidrUpdateRetries = 10
+	cidrUpdateRetries = 3
 )
 
 // CIDRAllocator is an interface implemented by things that know how
diff --git a/pkg/controller/nodeipam/ipam/cloud_cidr_allocator.go b/pkg/controller/nodeipam/ipam/cloud_cidr_allocator.go
index 7a07409c7c..8d6ef878da 100644
--- a/pkg/controller/nodeipam/ipam/cloud_cidr_allocator.go
+++ b/pkg/controller/nodeipam/ipam/cloud_cidr_allocator.go
@@ -210,35 +210,34 @@ func (ca *cloudCIDRAllocator) updateCIDRAllocation(nodeName string) error {
 	}
 	podCIDR := cidr.String()
-	for rep := 0; rep < cidrUpdateRetries; rep++ {
-		node, err = ca.nodeLister.Get(nodeName)
-		if err != nil {
-			glog.Errorf("Failed while getting node %v to retry updating Node.Spec.PodCIDR: %v", nodeName, err)
-			continue
-		}
+	node, err = ca.nodeLister.Get(nodeName)
+	if err != nil {
+		glog.Errorf("Failed while getting node %v for updating Node.Spec.PodCIDR: %v", nodeName, err)
+		return err
+	}
+
+	if node.Spec.PodCIDR == podCIDR {
+		glog.V(4).Infof("Node %v already has allocated CIDR %v. It matches the proposed one.", node.Name, podCIDR)
+		// We don't return here, in order to set the NetworkUnavailable condition later below.
+	} else {
 		if node.Spec.PodCIDR != "" {
-			if node.Spec.PodCIDR == podCIDR {
-				glog.V(4).Infof("Node %v already has allocated CIDR %v. It matches the proposed one.", node.Name, podCIDR)
-				// We don't return to set the NetworkUnavailable condition if needed.
-				break
-			}
-			glog.Errorf("PodCIDR being reassigned! Node %v spec has %v, but cloud provider has assigned %v",
-				node.Name, node.Spec.PodCIDR, podCIDR)
+			glog.Errorf("PodCIDR being reassigned! Node %v spec has %v, but cloud provider has assigned %v", node.Name, node.Spec.PodCIDR, podCIDR)
 			// We fall through and set the CIDR despite this error. This
 			// implements the same logic as implemented in the
 			// rangeAllocator.
 			//
 			// See https://github.com/kubernetes/kubernetes/pull/42147#discussion_r103357248
 		}
-		if err = utilnode.PatchNodeCIDR(ca.client, types.NodeName(node.Name), podCIDR); err == nil {
-			glog.Infof("Set node %v PodCIDR to %v", node.Name, podCIDR)
-			break
+		for i := 0; i < cidrUpdateRetries; i++ {
+			if err = utilnode.PatchNodeCIDR(ca.client, types.NodeName(node.Name), podCIDR); err == nil {
+				glog.Infof("Set node %v PodCIDR to %v", node.Name, podCIDR)
+				break
+			}
 		}
-		glog.Errorf("Failed to update node %v PodCIDR to %v (%d retries left): %v", node.Name, podCIDR, cidrUpdateRetries-rep-1, err)
 	}
 	if err != nil {
 		nodeutil.RecordNodeStatusChange(ca.recorder, node, "CIDRAssignmentFailed")
-		glog.Errorf("CIDR assignment for node %v failed: %v.", nodeName, err)
+		glog.Errorf("Failed to update node %v PodCIDR to %v after multiple attempts: %v", node.Name, podCIDR, err)
 		return err
 	}
diff --git a/pkg/controller/nodeipam/ipam/range_allocator.go b/pkg/controller/nodeipam/ipam/range_allocator.go
index 5de2195854..241bc2f1b8 100644
--- a/pkg/controller/nodeipam/ipam/range_allocator.go
+++ b/pkg/controller/nodeipam/ipam/range_allocator.go
@@ -286,39 +286,40 @@ func (r *rangeAllocator) updateCIDRAllocation(data nodeAndCIDR) error {
 	defer r.removeNodeFromProcessing(data.nodeName)
 
 	podCIDR := data.cidr.String()
-	for rep := 0; rep < cidrUpdateRetries; rep++ {
-		node, err = r.nodeLister.Get(data.nodeName)
-		if err != nil {
-			glog.Errorf("Failed while getting node %v to retry updating Node.Spec.PodCIDR: %v", data.nodeName, err)
-			continue
-		}
-		if node.Spec.PodCIDR != "" {
-			glog.V(4).Infof("Node %v already has allocated CIDR %v. Releasing assigned one if different.", node.Name, node.Spec.PodCIDR)
-			if node.Spec.PodCIDR != podCIDR {
-				glog.Errorf("Node %q PodCIDR seems to have changed (original=%v, current=%v), releasing original and occupying new CIDR",
-					node.Name, node.Spec.PodCIDR, podCIDR)
-				if err := r.cidrs.Release(data.cidr); err != nil {
-					glog.Errorf("Error when releasing CIDR %v", podCIDR)
-				}
-			}
-			return nil
+
+	node, err = r.nodeLister.Get(data.nodeName)
+	if err != nil {
+		glog.Errorf("Failed while getting node %v for updating Node.Spec.PodCIDR: %v", data.nodeName, err)
+		return err
+	}
+
+	if node.Spec.PodCIDR == podCIDR {
+		glog.V(4).Infof("Node %v already has allocated CIDR %v. It matches the proposed one.", node.Name, podCIDR)
+		return nil
+	}
+	if node.Spec.PodCIDR != "" {
+		glog.Errorf("Node %v already has a CIDR allocated %v. Releasing the new one %v.", node.Name, node.Spec.PodCIDR, podCIDR)
+		if err := r.cidrs.Release(data.cidr); err != nil {
+			glog.Errorf("Error when releasing CIDR %v", podCIDR)
 		}
+		return nil
+	}
+
+	// If we reached here, it means that the node has no CIDR currently assigned. So we set it.
+	for i := 0; i < cidrUpdateRetries; i++ {
 		if err = utilnode.PatchNodeCIDR(r.client, types.NodeName(node.Name), podCIDR); err == nil {
 			glog.Infof("Set node %v PodCIDR to %v", node.Name, podCIDR)
-			break
+			return nil
 		}
-		glog.Errorf("Failed to update node %v PodCIDR to %v (%d retries left): %v", node.Name, podCIDR, cidrUpdateRetries-rep-1, err)
 	}
-	if err != nil {
-		nodeutil.RecordNodeStatusChange(r.recorder, node, "CIDRAssignmentFailed")
-		// We accept the fact that we may leek CIDRs here. This is safer than releasing
-		// them in case when we don't know if request went through.
-		// NodeController restart will return all falsely allocated CIDRs to the pool.
-		if !apierrors.IsServerTimeout(err) {
-			glog.Errorf("CIDR assignment for node %v failed: %v. Releasing allocated CIDR", data.nodeName, err)
-			if releaseErr := r.cidrs.Release(data.cidr); releaseErr != nil {
-				glog.Errorf("Error releasing allocated CIDR for node %v: %v", data.nodeName, releaseErr)
-			}
+	glog.Errorf("Failed to update node %v PodCIDR to %v after multiple attempts: %v", node.Name, podCIDR, err)
+	nodeutil.RecordNodeStatusChange(r.recorder, node, "CIDRAssignmentFailed")
+	// We accept the fact that we may leak CIDRs here. This is safer than releasing
+	// them in case when we don't know if request went through.
+	// NodeController restart will return all falsely allocated CIDRs to the pool.
+	if !apierrors.IsServerTimeout(err) {
+		glog.Errorf("CIDR assignment for node %v failed: %v. Releasing allocated CIDR", node.Name, err)
+		if releaseErr := r.cidrs.Release(data.cidr); releaseErr != nil {
+			glog.Errorf("Error releasing allocated CIDR for node %v: %v", data.nodeName, releaseErr)
 		}
 	}
 	return err