mirror of https://github.com/k3s-io/k3s
Merge pull request #71727 from mikedanese/fixcrm
pkg/kubelet/cloudresource: fallback to old addresses if sync loop fails
commit
ae2b176439
@@ -21,6 +21,7 @@ go_test(
     deps = [
         "//pkg/cloudprovider/providers/fake:go_default_library",
         "//staging/src/k8s.io/api/core/v1:go_default_library",
+        "//staging/src/k8s.io/apimachinery/pkg/util/diff:go_default_library",
     ],
 )
@@ -30,8 +30,6 @@ import (
     "k8s.io/klog"
 )
 
-var nodeAddressesRetryPeriod = 5 * time.Second
-
 // SyncManager is an interface for making requests to a cloud provider
 type SyncManager interface {
     Run(stopCh <-chan struct{})
@@ -46,79 +44,92 @@ type cloudResourceSyncManager struct {
     // Sync period
     syncPeriod time.Duration
 
-    nodeAddressesMux sync.Mutex
-    nodeAddressesErr error
-    nodeAddresses    []v1.NodeAddress
+    nodeAddressesMonitor *sync.Cond
+    nodeAddressesErr     error
+    nodeAddresses        []v1.NodeAddress
 
     nodeName types.NodeName
 }
 
-// NewSyncManager creates a manager responsible for collecting resources
-// from a cloud provider through requests that are sensitive to timeouts and hanging
+// NewSyncManager creates a manager responsible for collecting resources from a
+// cloud provider through requests that are sensitive to timeouts and hanging
 func NewSyncManager(cloud cloudprovider.Interface, nodeName types.NodeName, syncPeriod time.Duration) SyncManager {
     return &cloudResourceSyncManager{
         cloud:      cloud,
         syncPeriod: syncPeriod,
         nodeName:   nodeName,
+        // nodeAddressesMonitor is a monitor that guards a result (nodeAddresses,
+        // nodeAddressesErr) of the sync loop under the condition that a result has
+        // been saved at least once. The semantics here are:
+        //
+        // * Readers of the result will wait on the monitor until the first result
+        //   has been saved.
+        // * The sync loop (i.e. the only writer), will signal all waiters every
+        //   time it updates the result.
+        nodeAddressesMonitor: sync.NewCond(&sync.Mutex{}),
     }
 }
 
-func (manager *cloudResourceSyncManager) getNodeAddressSafe() ([]v1.NodeAddress, error) {
-    manager.nodeAddressesMux.Lock()
-    defer manager.nodeAddressesMux.Unlock()
-
-    return manager.nodeAddresses, manager.nodeAddressesErr
-}
-
-func (manager *cloudResourceSyncManager) setNodeAddressSafe(nodeAddresses []v1.NodeAddress, err error) {
-    manager.nodeAddressesMux.Lock()
-    defer manager.nodeAddressesMux.Unlock()
-
-    manager.nodeAddresses = nodeAddresses
-    manager.nodeAddressesErr = err
-}
-
-// NodeAddresses does not wait for cloud provider to return a node addresses.
-// It always returns node addresses or an error.
-func (manager *cloudResourceSyncManager) NodeAddresses() ([]v1.NodeAddress, error) {
+// NodeAddresses waits for the first sync loop to run. If no successful syncs
+// have run, it will return the most recent error. If node addresses have been
+// synced successfully, it will return the list of node addresses from the most
+// recent successful sync.
+func (m *cloudResourceSyncManager) NodeAddresses() ([]v1.NodeAddress, error) {
+    m.nodeAddressesMonitor.L.Lock()
+    defer m.nodeAddressesMonitor.L.Unlock()
-    // wait until there is something
     for {
-        nodeAddresses, err := manager.getNodeAddressSafe()
-        if len(nodeAddresses) == 0 && err == nil {
-            klog.V(5).Infof("Waiting for %v for cloud provider to provide node addresses", nodeAddressesRetryPeriod)
-            time.Sleep(nodeAddressesRetryPeriod)
-            continue
+        if addrs, err := m.nodeAddresses, m.nodeAddressesErr; len(addrs) > 0 || err != nil {
+            return addrs, err
         }
-        return nodeAddresses, err
+        klog.V(5).Infof("Waiting for cloud provider to provide node addresses")
+        m.nodeAddressesMonitor.Wait()
     }
 }
 
-func (manager *cloudResourceSyncManager) collectNodeAddresses(ctx context.Context, nodeName types.NodeName) {
-    klog.V(5).Infof("Requesting node addresses from cloud provider for node %q", nodeName)
-
-    instances, ok := manager.cloud.Instances()
+// getNodeAddresses calls the cloud provider to get a current list of node addresses.
+func (m *cloudResourceSyncManager) getNodeAddresses() ([]v1.NodeAddress, error) {
+    // TODO(roberthbailey): Can we do this without having credentials to talk to
+    // the cloud provider?
+    // TODO(justinsb): We can if CurrentNodeName() was actually CurrentNode() and
+    // returned an interface.
+    // TODO: If IP addresses couldn't be fetched from the cloud provider, should
+    // kubelet fallback on the other methods for getting the IP below?
+    instances, ok := m.cloud.Instances()
     if !ok {
-        manager.setNodeAddressSafe(nil, fmt.Errorf("failed to get instances from cloud provider"))
+        return nil, fmt.Errorf("failed to get instances from cloud provider")
     }
+    return instances.NodeAddresses(context.TODO(), m.nodeName)
+}
+
+func (m *cloudResourceSyncManager) syncNodeAddresses() {
+    klog.V(5).Infof("Requesting node addresses from cloud provider for node %q", m.nodeName)
+
+    addrs, err := m.getNodeAddresses()
+
+    m.nodeAddressesMonitor.L.Lock()
+    defer m.nodeAddressesMonitor.L.Unlock()
+    defer m.nodeAddressesMonitor.Broadcast()
+
+    if err != nil {
+        klog.V(2).Infof("Node addresses from cloud provider for node %q not collected: %v", m.nodeName, err)
+
+        if len(m.nodeAddresses) > 0 {
+            // in the event that a sync loop fails when a previous sync had
+            // succeeded, continue to use the old addresses.
+            return
+        }
+
+        m.nodeAddressesErr = fmt.Errorf("failed to get node address from cloud provider: %v", err)
+        return
+    }
-
-    // TODO(roberthbailey): Can we do this without having credentials to talk
-    // to the cloud provider?
-    // TODO(justinsb): We can if CurrentNodeName() was actually CurrentNode() and returned an interface
-    // TODO: If IP addresses couldn't be fetched from the cloud provider, should kubelet fallback on the other methods for getting the IP below?
-
-    nodeAddresses, err := instances.NodeAddresses(ctx, nodeName)
-    if err != nil {
-        manager.setNodeAddressSafe(nil, fmt.Errorf("failed to get node address from cloud provider: %v", err))
-        klog.V(2).Infof("Node addresses from cloud provider for node %q not collected", nodeName)
-    } else {
-        manager.setNodeAddressSafe(nodeAddresses, nil)
-        klog.V(5).Infof("Node addresses from cloud provider for node %q collected", nodeName)
-    }
+    klog.V(5).Infof("Node addresses from cloud provider for node %q collected", m.nodeName)
+    m.nodeAddressesErr = nil
+    m.nodeAddresses = addrs
 }
 
-func (manager *cloudResourceSyncManager) Run(stopCh <-chan struct{}) {
-    wait.Until(func() {
-        manager.collectNodeAddresses(context.TODO(), manager.nodeName)
-    }, manager.syncPeriod, stopCh)
+// Run starts the cloud resource sync manager's sync loop.
+func (m *cloudResourceSyncManager) Run(stopCh <-chan struct{}) {
+    wait.Until(m.syncNodeAddresses, m.syncPeriod, stopCh)
 }
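The constructor comment above spells out the monitor semantics: readers block on a condition variable until the sync loop has published at least one result (addresses or an error), the loop broadcasts after every iteration, and on failure it keeps the previously synced addresses. A minimal standalone sketch of that pattern, using only the Go standard library (the result type, get, and set are illustrative names, not kubelet code):

package main

import (
    "errors"
    "fmt"
    "sync"
    "time"
)

// result guards the latest outcome of a background sync loop with a
// sync.Cond so readers can block until the first outcome is published.
type result struct {
    cond  *sync.Cond
    addrs []string
    err   error
}

func newResult() *result {
    return &result{cond: sync.NewCond(&sync.Mutex{})}
}

// get blocks until addresses or an error have been recorded at least once,
// then returns the most recent value (same shape as NodeAddresses above).
func (r *result) get() ([]string, error) {
    r.cond.L.Lock()
    defer r.cond.L.Unlock()
    for len(r.addrs) == 0 && r.err == nil {
        r.cond.Wait() // releases the lock while waiting for a Broadcast
    }
    return r.addrs, r.err
}

// set records one sync iteration and wakes all waiting readers; on error it
// keeps previously synced addresses (same shape as syncNodeAddresses above).
func (r *result) set(addrs []string, err error) {
    r.cond.L.Lock()
    defer r.cond.L.Unlock()
    defer r.cond.Broadcast()
    if err != nil {
        if len(r.addrs) > 0 {
            return // fall back to the old addresses
        }
        r.err = err
        return
    }
    r.addrs, r.err = addrs, nil
}

func main() {
    r := newResult()
    go func() {
        r.set(nil, errors.New("cloud unreachable")) // first sync fails
        time.Sleep(50 * time.Millisecond)
        r.set([]string{"10.0.1.12"}, nil) // later sync succeeds
    }()
    fmt.Println(r.get()) // returns as soon as the first result is published
}

The deferred Broadcast mirrors the manager's choice to wake readers on every iteration, not just successful ones, so a caller blocked in NodeAddresses sees the first error instead of hanging forever.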
@@ -17,33 +17,16 @@ limitations under the License.
 package cloudresource
 
 import (
-    "fmt"
+    "errors"
     "reflect"
     "testing"
     "time"
 
     "k8s.io/api/core/v1"
+    "k8s.io/apimachinery/pkg/util/diff"
     "k8s.io/kubernetes/pkg/cloudprovider/providers/fake"
 )
 
-func collectNodeAddresses(manager *cloudResourceSyncManager) ([]v1.NodeAddress, error) {
-    var nodeAddresses []v1.NodeAddress
-    var err error
-
-    collected := make(chan struct{}, 1)
-    go func() {
-        nodeAddresses, err = manager.NodeAddresses()
-        close(collected)
-    }()
-
-    select {
-    case <-collected:
-        return nodeAddresses, err
-    case <-time.Tick(2 * nodeAddressesRetryPeriod):
-        return nil, fmt.Errorf("Timeout after %v waiting for address to appear", 2*nodeAddressesRetryPeriod)
-    }
-}
-
 func createNodeInternalIPAddress(address string) []v1.NodeAddress {
     return []v1.NodeAddress{
         {
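The removed collectNodeAddresses helper above existed because the old NodeAddresses polled with time.Sleep and could block for a long time, so the test raced it against a timer in a separate goroutine; with the condition-variable manager the call returns as soon as the first sync publishes a result, and the wrapper is no longer needed. A small sketch of that wrap-a-blocking-call-in-a-timeout pattern, assuming only the standard library (callWithTimeout and fetch are illustrative names):

package main

import (
    "errors"
    "fmt"
    "time"
)

// callWithTimeout runs fetch in a goroutine and gives up after d, the same
// select-on-a-timer shape the removed test helper used (time.After avoids
// the ticker that time.Tick would leak).
func callWithTimeout(fetch func() ([]string, error), d time.Duration) ([]string, error) {
    type outcome struct {
        addrs []string
        err   error
    }
    done := make(chan outcome, 1) // buffered so the goroutine never blocks forever
    go func() {
        addrs, err := fetch()
        done <- outcome{addrs, err}
    }()
    select {
    case o := <-done:
        return o.addrs, o.err
    case <-time.After(d):
        return nil, errors.New("timeout waiting for addresses to appear")
    }
}

func main() {
    addrs, err := callWithTimeout(func() ([]string, error) {
        time.Sleep(50 * time.Millisecond) // stand-in for a slow cloud call
        return []string{"10.0.1.12"}, nil
    }, time.Second)
    fmt.Println(addrs, err)
}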
@@ -53,13 +36,12 @@ func createNodeInternalIPAddress(address string) []v1.NodeAddress {
     }
 }
 
-func TestNodeAddressesRequest(t *testing.T) {
-    syncPeriod := 300 * time.Millisecond
-    maxRetry := 5
+func TestNodeAddressesDelay(t *testing.T) {
+    syncPeriod := 100 * time.Millisecond
     cloud := &fake.FakeCloud{
         Addresses: createNodeInternalIPAddress("10.0.1.12"),
         // Set the request delay so the manager timeouts and collects the node addresses later
-        RequestDelay: 400 * time.Millisecond,
+        RequestDelay: 200 * time.Millisecond,
     }
     stopCh := make(chan struct{})
     defer close(stopCh)
@@ -67,27 +49,28 @@ func TestNodeAddressesRequest(t *testing.T) {
     manager := NewSyncManager(cloud, "defaultNode", syncPeriod).(*cloudResourceSyncManager)
     go manager.Run(stopCh)
 
-    nodeAddresses, err := collectNodeAddresses(manager)
+    nodeAddresses, err := manager.NodeAddresses()
     if err != nil {
         t.Errorf("Unexpected err: %q\n", err)
     }
     if !reflect.DeepEqual(nodeAddresses, cloud.Addresses) {
-        t.Errorf("Unexpected list of node addresses %#v, expected %#v: %v", nodeAddresses, cloud.Addresses, err)
+        t.Errorf("Unexpected diff of node addresses: %v", diff.ObjectReflectDiff(nodeAddresses, cloud.Addresses))
     }
 
     // Change the IP address
     cloud.SetNodeAddresses(createNodeInternalIPAddress("10.0.1.13"))
 
     // Wait until the IP address changes
+    maxRetry := 5
     for i := 0; i < maxRetry; i++ {
-        nodeAddresses, err := collectNodeAddresses(manager)
+        nodeAddresses, err := manager.NodeAddresses()
         t.Logf("nodeAddresses: %#v, err: %v", nodeAddresses, err)
         if err != nil {
             t.Errorf("Unexpected err: %q\n", err)
         }
         // It is safe to read cloud.Addresses since no routine is changing the value at the same time
         if err == nil && nodeAddresses[0].Address != cloud.Addresses[0].Address {
-            time.Sleep(100 * time.Millisecond)
+            time.Sleep(syncPeriod)
             continue
         }
         if err != nil {
@@ -97,3 +80,54 @@ func TestNodeAddressesRequest(t *testing.T) {
     }
     t.Errorf("Timeout waiting for %q address to appear", cloud.Addresses[0].Address)
 }
+
+func TestNodeAddressesUsesLastSuccess(t *testing.T) {
+    cloud := &fake.FakeCloud{}
+    manager := NewSyncManager(cloud, "defaultNode", 0).(*cloudResourceSyncManager)
+
+    // These tests are stateful and order dependant.
+    tests := []struct {
+        name      string
+        addrs     []v1.NodeAddress
+        err       error
+        wantAddrs []v1.NodeAddress
+        wantErr   bool
+    }{
+        {
+            name:    "first sync loop encounters an error",
+            err:     errors.New("bad"),
+            wantErr: true,
+        },
+        {
+            name:      "subsequent sync loop succeeds",
+            addrs:     createNodeInternalIPAddress("10.0.1.12"),
+            wantAddrs: createNodeInternalIPAddress("10.0.1.12"),
+        },
+        {
+            name:      "subsequent sync loop encounters an error, last addresses returned",
+            err:       errors.New("bad"),
+            wantAddrs: createNodeInternalIPAddress("10.0.1.12"),
+        },
+        {
+            name:      "subsequent sync loop succeeds changing addresses",
+            addrs:     createNodeInternalIPAddress("10.0.1.13"),
+            wantAddrs: createNodeInternalIPAddress("10.0.1.13"),
+        },
+    }
+
+    for _, test := range tests {
+        t.Run(test.name, func(t *testing.T) {
+            cloud.Addresses = test.addrs
+            cloud.Err = test.err
+
+            manager.syncNodeAddresses()
+            nodeAddresses, err := manager.NodeAddresses()
+            if (err != nil) != test.wantErr {
+                t.Errorf("unexpected err: %v", err)
+            }
+            if got, want := nodeAddresses, test.wantAddrs; !reflect.DeepEqual(got, want) {
+                t.Errorf("Unexpected diff of node addresses: %v", diff.ObjectReflectDiff(got, want))
+            }
+        })
+    }
+}
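For context on how the manager is meant to be consumed: a caller starts Run in the background and then calls NodeAddresses, which blocks until the first sync loop has published addresses or an error. A hedged sketch of that call pattern, reusing the fake cloud provider from the tests above (the wiring, node name, and sync period are illustrative; the real kubelet plumbing differs):

package main

import (
    "fmt"
    "time"

    v1 "k8s.io/api/core/v1"
    "k8s.io/kubernetes/pkg/cloudprovider/providers/fake"
    "k8s.io/kubernetes/pkg/kubelet/cloudresource"
)

func main() {
    // A fake cloud that answers with a single internal IP, as in the tests.
    cloud := &fake.FakeCloud{
        Addresses: []v1.NodeAddress{{Type: v1.NodeInternalIP, Address: "10.0.1.12"}},
    }

    manager := cloudresource.NewSyncManager(cloud, "my-node", 10*time.Second)

    stopCh := make(chan struct{})
    defer close(stopCh)
    go manager.Run(stopCh) // periodic sync via wait.Until

    // Blocks until the first sync has published a result, then returns the
    // most recently synced addresses (or the most recent error).
    addrs, err := manager.NodeAddresses()
    fmt.Println(addrs, err)
}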