Merge pull request #71828 from yuexiao-wang/cleanup-upgrad-etcd-left

kubeadm: clean up leftover code from the no-TLS etcd to TLS etcd upgrade
Kubernetes Prow Robot 2018-12-08 02:39:54 -08:00 committed by GitHub
commit c06c08e5ae
4 changed files with 10 additions and 34 deletions
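The substantive change across these files is the signature of WaitForClusterAvailable: the leading delay parameter is gone, so callers no longer pay an up-front sleep and only the retry count and interval remain. Below is a minimal, self-contained sketch of the new call shape; the stub client is a placeholder modeled on the test fakes in this diff, not kubeadm's real etcd client.

package main

import (
	"fmt"
	"time"
)

// stubEtcdClient mirrors the test fakes in this PR: it reports the cluster as
// available immediately. It exists only to make the call pattern compilable.
type stubEtcdClient struct{}

func (stubEtcdClient) WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error) {
	return true, nil
}

func main() {
	client := stubEtcdClient{}
	// Old call sites read WaitForClusterAvailable(noDelay, retries, retryInterval);
	// with the delay parameter removed, only retries and the retry interval are passed.
	if _, err := client.WaitForClusterAvailable(10, 15*time.Second); err != nil {
		fmt.Printf("[upgrade/etcd] Failed to healthcheck etcd: %v\n", err)
	}
}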

@@ -78,7 +78,7 @@ type fakeEtcdClient struct {
 func (f fakeEtcdClient) ClusterAvailable() (bool, error) { return true, nil }
-func (f fakeEtcdClient) WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error) {
+func (f fakeEtcdClient) WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error) {
 	return true, nil
 }

@@ -174,20 +174,6 @@ func upgradeComponent(component string, waiter apiclient.Waiter, pathMgr StaticP
 		recoverEtcd = true
 	}

-	// We currently depend on getting the Etcd mirror Pod hash from the KubeAPIServer;
-	// Upgrading the Etcd protocol takes down the apiserver, so we can't verify component restarts if we restart Etcd independently.
-	// Skip waiting for Etcd to restart and immediately move on to updating the apiserver.
-	if component == constants.Etcd {
-		waitForComponentRestart = false
-	}
-	// Normally, if an Etcd upgrade is successful, but the apiserver upgrade fails, Etcd is not rolled back.
-	// In the case of a TLS upgrade, the old KubeAPIServer config is incompatible with the new Etcd confg, so we rollback Etcd
-	// if the APIServer upgrade fails.
-	if component == constants.KubeAPIServer {
-		recoverEtcd = true
-		fmt.Printf("[upgrade/staticpods] The %s manifest will be restored if component %q fails to upgrade\n", constants.Etcd, component)
-	}
-
 	if err := renewCerts(cfg, component); err != nil {
 		return errors.Wrapf(err, "failed to renew certificates for component %q", component)
 	}
@@ -311,14 +297,6 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 		return true, errors.Wrap(err, "error creating local etcd static pod manifest file")
 	}

-	// Waiter configurations for checking etcd status
-	// If we are upgrading TLS we need to wait for old static pod to be removed.
-	// This is needed because we are not able to currently verify that the static pod
-	// has been updated through the apiserver across an etcd TLS upgrade.
-	// This value is arbitrary but seems to be long enough in manual testing.
-	noDelay := 0 * time.Second
-	podRestartDelay := 30 * time.Second
-
 	retries := 10
 	retryInterval := 15 * time.Second
@@ -328,7 +306,7 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 		// Since upgrade component failed, the old etcd manifest has either been restored or was never touched
 		// Now we need to check the health of etcd cluster if it is up with old manifest
 		fmt.Println("[upgrade/etcd] Waiting for previous etcd to become available")
-		if _, err := oldEtcdClient.WaitForClusterAvailable(noDelay, retries, retryInterval); err != nil {
+		if _, err := oldEtcdClient.WaitForClusterAvailable(retries, retryInterval); err != nil {
 			fmt.Printf("[upgrade/etcd] Failed to healthcheck previous etcd: %v\n", err)
 			// At this point we know that etcd cluster is dead and it is safe to copy backup datastore and to rollback old etcd manifest
@@ -341,7 +319,7 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 			// Now that we've rolled back the data, let's check if the cluster comes up
 			fmt.Println("[upgrade/etcd] Waiting for previous etcd to become available")
-			if _, err := oldEtcdClient.WaitForClusterAvailable(noDelay, retries, retryInterval); err != nil {
+			if _, err := oldEtcdClient.WaitForClusterAvailable(retries, retryInterval); err != nil {
 				fmt.Printf("[upgrade/etcd] Failed to healthcheck previous etcd: %v\n", err)
 				// Nothing else left to try to recover etcd cluster
 				return true, errors.Wrapf(err, "fatal error rolling back local etcd cluster manifest, the backup of etcd database is stored here:(%s)", backupEtcdDir)
@@ -366,7 +344,7 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 	// Checking health state of etcd after the upgrade
 	fmt.Println("[upgrade/etcd] Waiting for etcd to become available")
-	if _, err = newEtcdClient.WaitForClusterAvailable(podRestartDelay, retries, retryInterval); err != nil {
+	if _, err = newEtcdClient.WaitForClusterAvailable(retries, retryInterval); err != nil {
 		fmt.Printf("[upgrade/etcd] Failed to healthcheck etcd: %v\n", err)
 		// Despite the fact that upgradeComponent was successful, there is something wrong with the etcd cluster
 		// First step is to restore back up of datastore
@@ -384,7 +362,7 @@ func performEtcdStaticPodUpgrade(client clientset.Interface, waiter apiclient.Wa
 		// Assuming rollback of the old etcd manifest was successful, check the status of etcd cluster again
 		fmt.Println("[upgrade/etcd] Waiting for previous etcd to become available")
-		if _, err := oldEtcdClient.WaitForClusterAvailable(noDelay, retries, retryInterval); err != nil {
+		if _, err := oldEtcdClient.WaitForClusterAvailable(retries, retryInterval); err != nil {
 			fmt.Printf("[upgrade/etcd] Failed to healthcheck previous etcd: %v\n", err)
 			// Nothing else left to try to recover etcd cluster
 			return true, errors.Wrapf(err, "fatal error rolling back local etcd cluster manifest, the backup of etcd database is stored here:(%s)", backupEtcdDir)
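The call sites above all follow one recovery pattern: wait for the previous etcd to come back; if it does not, restore the backup and check again; if that also fails, surface the backup location and give up. The following is an illustrative, self-contained sketch of that flow under the new signature, not kubeadm's actual code; the rollback function and the backup path in main are hypothetical stand-ins.

package main

import (
	"fmt"
	"time"
)

// waitFunc has the shape of the new WaitForClusterAvailable.
type waitFunc func(retries int, retryInterval time.Duration) (bool, error)

// recoverEtcd sketches the flow around the call sites above: healthcheck the
// previous etcd, roll back from the backup if it is dead, then healthcheck once
// more before declaring a fatal error.
func recoverEtcd(wait waitFunc, rollback func(backupDir string) error, backupEtcdDir string) error {
	retries := 10
	retryInterval := 15 * time.Second

	fmt.Println("[upgrade/etcd] Waiting for previous etcd to become available")
	_, err := wait(retries, retryInterval)
	if err == nil {
		return nil // the previous etcd is healthy again; nothing to roll back
	}
	fmt.Printf("[upgrade/etcd] Failed to healthcheck previous etcd: %v\n", err)

	// At this point the cluster is considered dead: restore the backed-up
	// datastore and the old manifest (hypothetical helper for this sketch).
	if err := rollback(backupEtcdDir); err != nil {
		return fmt.Errorf("rolling back etcd from backup %q: %v", backupEtcdDir, err)
	}

	fmt.Println("[upgrade/etcd] Waiting for previous etcd to become available")
	if _, err := wait(retries, retryInterval); err != nil {
		// Nothing else left to try to recover the etcd cluster.
		return fmt.Errorf("fatal error rolling back local etcd cluster manifest, the backup of etcd database is stored here:(%s): %v", backupEtcdDir, err)
	}
	return nil
}

func main() {
	alwaysUp := func(retries int, retryInterval time.Duration) (bool, error) { return true, nil }
	noopRollback := func(string) error { return nil }
	if err := recoverEtcd(alwaysUp, noopRollback, "/path/to/etcd-backup"); err != nil {
		fmt.Println(err)
	}
}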

@@ -230,7 +230,7 @@ type fakeTLSEtcdClient struct{ TLS bool }
 func (c fakeTLSEtcdClient) ClusterAvailable() (bool, error) { return true, nil }
-func (c fakeTLSEtcdClient) WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error) {
+func (c fakeTLSEtcdClient) WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error) {
 	return true, nil
 }
@@ -261,7 +261,7 @@ type fakePodManifestEtcdClient struct{ ManifestDir, CertificatesDir string }
 func (c fakePodManifestEtcdClient) ClusterAvailable() (bool, error) { return true, nil }
-func (c fakePodManifestEtcdClient) WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error) {
+func (c fakePodManifestEtcdClient) WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error) {
 	return true, nil
 }

@@ -43,7 +43,7 @@ type ClusterInterrogator interface {
 	GetClusterStatus() (map[string]*clientv3.StatusResponse, error)
 	GetClusterVersions() (map[string]string, error)
 	GetVersion() (string, error)
-	WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error)
+	WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error)
 	Sync() error
 	AddMember(name string, peerAddrs string) ([]Member, error)
 }
@@ -328,10 +328,8 @@ func (c Client) GetClusterStatus() (map[string]*clientv3.StatusResponse, error)
 	return clusterStatus, nil
 }

-// WaitForClusterAvailable returns true if all endpoints in the cluster are available after an initial delay and retry attempts, an error is returned otherwise
-func (c Client) WaitForClusterAvailable(delay time.Duration, retries int, retryInterval time.Duration) (bool, error) {
-	fmt.Printf("[util/etcd] Waiting %v for initial delay\n", delay)
-	time.Sleep(delay)
+// WaitForClusterAvailable returns true if all endpoints in the cluster are available after retry attempts, an error is returned otherwise
+func (c Client) WaitForClusterAvailable(retries int, retryInterval time.Duration) (bool, error) {
 	for i := 0; i < retries; i++ {
 		if i > 0 {
 			fmt.Printf("[util/etcd] Waiting %v until next retry\n", retryInterval)
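The hunk is cut off inside the retry loop. For orientation, here is a self-contained sketch of how a loop of this shape behaves once the initial delay is removed; the ClusterAvailable probe is taken from the ClusterInterrogator interface above, and this is a reading of the diff, not the exact upstream body.

package main

import (
	"fmt"
	"time"
)

// prober is the minimal slice of ClusterInterrogator this sketch needs.
type prober interface {
	ClusterAvailable() (bool, error)
}

// waitForClusterAvailable sketches the loop shown above: no up-front sleep,
// one availability probe per attempt, and a fixed wait only between attempts.
func waitForClusterAvailable(c prober, retries int, retryInterval time.Duration) (bool, error) {
	for i := 0; i < retries; i++ {
		if i > 0 {
			fmt.Printf("[util/etcd] Waiting %v until next retry\n", retryInterval)
			time.Sleep(retryInterval)
		}
		// Probe the cluster; keep retrying on error or if it is not yet available.
		if ok, err := c.ClusterAvailable(); err == nil && ok {
			return true, nil
		}
	}
	return false, fmt.Errorf("timeout waiting for etcd cluster to be available")
}

// upProber always reports the cluster as available, so the demo below returns
// on the first attempt without sleeping.
type upProber struct{}

func (upProber) ClusterAvailable() (bool, error) { return true, nil }

func main() {
	ok, err := waitForClusterAvailable(upProber{}, 10, 15*time.Second)
	fmt.Println(ok, err)
}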