From 4c0c7dd8793aa5b0fde172f5e441f26e26a40442 Mon Sep 17 00:00:00 2001 From: Filip Grzadkowski Date: Tue, 16 Jun 2015 16:34:12 +0200 Subject: [PATCH] Fix initializing IP/Port allocators when etcd is not reachable. --- pkg/master/controller.go | 6 ++++-- pkg/registry/service/allocator/etcd/etcd_test.go | 4 ++-- .../service/ipallocator/controller/repair.go | 12 +++++++++++- pkg/registry/service/ipallocator/etcd/etcd_test.go | 4 ++-- .../service/portallocator/controller/repair.go | 12 +++++++++++- 5 files changed, 30 insertions(+), 8 deletions(-) diff --git a/pkg/master/controller.go b/pkg/master/controller.go index a8558750cf..60d60721c9 100644 --- a/pkg/master/controller.go +++ b/pkg/master/controller.go @@ -76,10 +76,12 @@ func (c *Controller) Start() { // run all of the controllers once prior to returning from Start. if err := repairClusterIPs.RunOnce(); err != nil { - glog.Errorf("Unable to perform initial IP allocation check: %v", err) + // If we fail to repair cluster IPs apiserver is useless. We should restart and retry. + glog.Fatalf("Unable to perform initial IP allocation check: %v", err) } if err := repairNodePorts.RunOnce(); err != nil { - glog.Errorf("Unable to perform initial service nodePort check: %v", err) + // If we fail to repair node ports apiserver is useless. We should restart and retry. + glog.Fatalf("Unable to perform initial service nodePort check: %v", err) } if err := c.UpdateKubernetesService(); err != nil { glog.Errorf("Unable to perform initial Kubernetes service initialization: %v", err) diff --git a/pkg/registry/service/allocator/etcd/etcd_test.go b/pkg/registry/service/allocator/etcd/etcd_test.go index 03612f647c..b0f0e33f14 100644 --- a/pkg/registry/service/allocator/etcd/etcd_test.go +++ b/pkg/registry/service/allocator/etcd/etcd_test.go @@ -17,11 +17,11 @@ limitations under the License. 
package etcd import ( + "strings" "testing" "github.com/coreos/go-etcd/etcd" - "fmt" "github.com/GoogleCloudPlatform/kubernetes/pkg/api" "github.com/GoogleCloudPlatform/kubernetes/pkg/api/testapi" "github.com/GoogleCloudPlatform/kubernetes/pkg/registry/service/allocator" @@ -54,7 +54,7 @@ func key() string { func TestEmpty(t *testing.T) { storage, _, ecli := newStorage(t) ecli.ExpectNotFoundGet(key()) - if _, err := storage.Allocate(1); fmt.Sprintf("%v", err) != "cannot allocate resources of type serviceipallocation at this time" { + if _, err := storage.Allocate(1); !strings.Contains(err.Error(), "cannot allocate resources of type serviceipallocation at this time") { t.Fatal(err) } } diff --git a/pkg/registry/service/ipallocator/controller/repair.go b/pkg/registry/service/ipallocator/controller/repair.go index 2c4ce347e3..d3cfc1f821 100644 --- a/pkg/registry/service/ipallocator/controller/repair.go +++ b/pkg/registry/service/ipallocator/controller/repair.go @@ -78,7 +78,17 @@ func (c *Repair) RunOnce() error { // and the release code must not release services that have had IPs allocated but not yet been created // See #8295 - latest, err := c.alloc.Get() + // If etcd server is not running we should wait for some time and fail only then. This is particularly + // important when we start apiserver and etcd at the same time. 
+ var latest *api.RangeAllocation + var err error + for i := 0; i < 10; i++ { + if latest, err = c.alloc.Get(); err != nil { + time.Sleep(time.Second) + } else { + break + } + } if err != nil { return fmt.Errorf("unable to refresh the service IP block: %v", err) } diff --git a/pkg/registry/service/ipallocator/etcd/etcd_test.go b/pkg/registry/service/ipallocator/etcd/etcd_test.go index 7c9b1427bc..baed5074e4 100644 --- a/pkg/registry/service/ipallocator/etcd/etcd_test.go +++ b/pkg/registry/service/ipallocator/etcd/etcd_test.go @@ -18,11 +18,11 @@ package etcd import ( "net" + "strings" "testing" "github.com/coreos/go-etcd/etcd" - "fmt" "github.com/GoogleCloudPlatform/kubernetes/pkg/api" "github.com/GoogleCloudPlatform/kubernetes/pkg/api/testapi" "github.com/GoogleCloudPlatform/kubernetes/pkg/registry/service/allocator" @@ -66,7 +66,7 @@ func key() string { func TestEmpty(t *testing.T) { storage, _, ecli := newStorage(t) ecli.ExpectNotFoundGet(key()) - if err := storage.Allocate(net.ParseIP("192.168.1.2")); fmt.Sprintf("%v", err) != "cannot allocate resources of type serviceipallocation at this time" { + if err := storage.Allocate(net.ParseIP("192.168.1.2")); !strings.Contains(err.Error(), "cannot allocate resources of type serviceipallocation at this time") { t.Fatal(err) } } diff --git a/pkg/registry/service/portallocator/controller/repair.go b/pkg/registry/service/portallocator/controller/repair.go index 5b8753b6d1..c2c52da686 100644 --- a/pkg/registry/service/portallocator/controller/repair.go +++ b/pkg/registry/service/portallocator/controller/repair.go @@ -63,7 +63,17 @@ func (c *Repair) RunOnce() error { // and the release code must not release services that have had ports allocated but not yet been created // See #8295 - latest, err := c.alloc.Get() + // If etcd server is not running we should wait for some time and fail only then. This is particularly + // important when we start apiserver and etcd at the same time. 
+ var latest *api.RangeAllocation + var err error + for i := 0; i < 10; i++ { + if latest, err = c.alloc.Get(); err != nil { + time.Sleep(time.Second) + } else { + break + } + } if err != nil { return fmt.Errorf("unable to refresh the port block: %v", err) }