diff --git a/cmd/kubeadm/app/phases/upgrade/compute_test.go b/cmd/kubeadm/app/phases/upgrade/compute_test.go
index 3aa67640b9..a84f0c60f4 100644
--- a/cmd/kubeadm/app/phases/upgrade/compute_test.go
+++ b/cmd/kubeadm/app/phases/upgrade/compute_test.go
@@ -78,7 +78,7 @@ type fakeEtcdClient struct {
 
 func (f fakeEtcdClient) ClusterAvailable() (bool, error) { return true, nil }
 
-func (f fakeEtcdClient) WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error) {
+func (f fakeEtcdClient) WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error) {
 	return true, nil
 }
 
diff --git a/cmd/kubeadm/app/phases/upgrade/staticpods.go b/cmd/kubeadm/app/phases/upgrade/staticpods.go
index 7e54de2be0..bcefea2f84 100644
--- a/cmd/kubeadm/app/phases/upgrade/staticpods.go
+++ b/cmd/kubeadm/app/phases/upgrade/staticpods.go
@@ -174,20 +174,6 @@ func upgradeComponent(component string, waiter apiclient.Waiter, pathMgr StaticP
 		recoverEtcd = true
 	}
 
-	// We currently depend on getting the Etcd mirror Pod hash from the KubeAPIServer;
-	// Upgrading the Etcd protocol takes down the apiserver, so we can't verify component restarts if we restart Etcd independently.
-	// Skip waiting for Etcd to restart and immediately move on to updating the apiserver.
-	if component == constants.Etcd {
-		waitForComponentRestart = false
-	}
-	// Normally, if an Etcd upgrade is successful, but the apiserver upgrade fails, Etcd is not rolled back.
-	// In the case of a TLS upgrade, the old KubeAPIServer config is incompatible with the new Etcd confg, so we rollback Etcd
-	// if the APIServer upgrade fails.
-	if component == constants.KubeAPIServer {
-		recoverEtcd = true
-		fmt.Printf("[upgrade/staticpods] The %s manifest will be restored if component %q fails to upgrade\n", constants.Etcd, component)
-	}
-
 	if err := renewCerts(cfg, component); err != nil {
 		return errors.Wrapf(err, "failed to renew certificates for component %q", component)
 	}
@@ -311,14 +297,6 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 		return true, errors.Wrap(err, "error creating local etcd static pod manifest file")
 	}
 
-	// Waiter configurations for checking etcd status
-	// If we are upgrading TLS we need to wait for old static pod to be removed.
-	// This is needed because we are not able to currently verify that the static pod
-	// has been updated through the apiserver across an etcd TLS upgrade.
-	// This value is arbitrary but seems to be long enough in manual testing.
-	noDelay := 0 * time.Second
-	podRestartDelay := 30 * time.Second
-
 	retries := 10
 	retryInterval := 15 * time.Second
 
@@ -328,7 +306,7 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 		// Since upgrade component failed, the old etcd manifest has either been restored or was never touched
 		// Now we need to check the health of etcd cluster if it is up with old manifest
 		fmt.Println("[upgrade/etcd] Waiting for previous etcd to become available")
-		if _, err := oldEtcdClient.WaitForClusterAvailable(noDelay, retries, retryInterval); err != nil {
+		if _, err := oldEtcdClient.WaitForClusterAvailable(retries, retryInterval); err != nil {
 			fmt.Printf("[upgrade/etcd] Failed to healthcheck previous etcd: %v\n", err)
 			// At this point we know that etcd cluster is dead and it is safe to copy backup datastore and to rollback old etcd manifest
@@ -341,7 +319,7 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 
 		// Now that we've rolled back the data, let's check if the cluster comes up
 		fmt.Println("[upgrade/etcd] Waiting for previous etcd to become available")
-		if _, err := oldEtcdClient.WaitForClusterAvailable(noDelay, retries, retryInterval); err != nil {
+		if _, err := oldEtcdClient.WaitForClusterAvailable(retries, retryInterval); err != nil {
 			fmt.Printf("[upgrade/etcd] Failed to healthcheck previous etcd: %v\n", err)
 			// Nothing else left to try to recover etcd cluster
 			return true, errors.Wrapf(err, "fatal error rolling back local etcd cluster manifest, the backup of etcd database is stored here:(%s)", backupEtcdDir)
@@ -366,7 +344,7 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 
 	// Checking health state of etcd after the upgrade
 	fmt.Println("[upgrade/etcd] Waiting for etcd to become available")
-	if _, err = newEtcdClient.WaitForClusterAvailable(podRestartDelay, retries, retryInterval); err != nil {
+	if _, err = newEtcdClient.WaitForClusterAvailable(retries, retryInterval); err != nil {
 		fmt.Printf("[upgrade/etcd] Failed to healthcheck etcd: %v\n", err)
 		// Despite the fact that upgradeComponent was successful, there is something wrong with the etcd cluster
 		// First step is to restore back up of datastore
@@ -384,7 +362,7 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 
 		// Assuming rollback of the old etcd manifest was successful, check the status of etcd cluster again
 		fmt.Println("[upgrade/etcd] Waiting for previous etcd to become available")
-		if _, err := oldEtcdClient.WaitForClusterAvailable(noDelay, retries, retryInterval); err != nil {
+		if _, err := oldEtcdClient.WaitForClusterAvailable(retries, retryInterval); err != nil {
 			fmt.Printf("[upgrade/etcd] Failed to healthcheck previous etcd: %v\n", err)
 			// Nothing else left to try to recover etcd cluster
 			return true, errors.Wrapf(err, "fatal error rolling back local etcd cluster manifest, the backup of etcd database is stored here:(%s)", backupEtcdDir)
diff --git a/cmd/kubeadm/app/phases/upgrade/staticpods_test.go b/cmd/kubeadm/app/phases/upgrade/staticpods_test.go
index 99f40cbb15..e246697bcc 100644
--- a/cmd/kubeadm/app/phases/upgrade/staticpods_test.go
+++ b/cmd/kubeadm/app/phases/upgrade/staticpods_test.go
@@ -230,7 +230,7 @@ type fakeTLSEtcdClient struct{ TLS bool }
 
 func (c fakeTLSEtcdClient) ClusterAvailable() (bool, error) { return true, nil }
 
-func (c fakeTLSEtcdClient) WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error) {
+func (c fakeTLSEtcdClient) WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error) {
 	return true, nil
 }
 
@@ -261,7 +261,7 @@ type fakePodManifestEtcdClient struct{ ManifestDir, CertificatesDir string }
 
 func (c fakePodManifestEtcdClient) ClusterAvailable() (bool, error) { return true, nil }
 
-func (c fakePodManifestEtcdClient) WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error) {
+func (c fakePodManifestEtcdClient) WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error) {
 	return true, nil
 }
 
diff --git a/cmd/kubeadm/app/util/etcd/etcd.go b/cmd/kubeadm/app/util/etcd/etcd.go
index 5aac683e14..bc93c20597 100644
--- a/cmd/kubeadm/app/util/etcd/etcd.go
+++ b/cmd/kubeadm/app/util/etcd/etcd.go
@@ -43,7 +43,7 @@ type ClusterInterrogator interface {
 	GetClusterStatus() (map[string]*clientv3.StatusResponse, error)
 	GetClusterVersions() (map[string]string, error)
 	GetVersion() (string, error)
-	WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error)
+	WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error)
 	Sync() error
 	AddMember(name string, peerAddrs string) ([]Member, error)
 }
@@ -328,10 +328,8 @@ func (c Client) GetClusterStatus() (map[string]*clientv3.StatusResponse, error)
 	return clusterStatus, nil
 }
 
-// WaitForClusterAvailable returns true if all endpoints in the cluster are available after an initial delay and retry attempts, an error is returned otherwise
-func (c Client) WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error) {
-	fmt.Printf("[util/etcd] Waiting %v for initial delay\n", delay)
-	time.Sleep(delay)
+// WaitForClusterAvailable returns true if all endpoints in the cluster are available after retry attempts, an error is returned otherwise
+func (c Client) WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error) {
 	for i := 0; i < retries; i++ {
 		if i > 0 {
 			fmt.Printf("[util/etcd] Waiting %v until next retry\n", retryInterval)
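
Caller-side note: because WaitForClusterAvailable no longer sleeps for an initial delay, a caller that previously relied on that delay (for example, waiting out an etcd static pod restart before the first health check) has to sleep on its own before polling. The sketch below is one way a caller could do that against the new two-argument signature; the helper name, the etcdutil import alias, and the pre-wait parameter are illustrative assumptions rather than part of this diff, while the retry settings (10 attempts, 15 seconds apart) mirror the values used in performEtcdStaticPodUpgrade.

	// waitForEtcdAfterRestart is a hypothetical caller-side helper: it performs
	// the pre-wait that WaitForClusterAvailable used to do internally, then polls
	// the cluster using the new (retries, retryInterval) signature.
	// Assumed imports: "time", "github.com/pkg/errors",
	// etcdutil "k8s.io/kubernetes/cmd/kubeadm/app/util/etcd".
	func waitForEtcdAfterRestart(client etcdutil.ClusterInterrogator, podRestartDelay time.Duration) error {
		// The initial delay is now the caller's responsibility.
		time.Sleep(podRestartDelay)
		// Same retry settings as performEtcdStaticPodUpgrade: 10 attempts, 15s apart.
		if _, err := client.WaitForClusterAvailable(10, 15*time.Second); err != nil {
			return errors.Wrap(err, "etcd cluster did not become available")
		}
		return nil
	}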