Fix etcd member deletion

Turns out etcd-only nodes were never running **any** of the controllers,
so allowing multiple controllers didn't really fix things.

Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
pull/6974/head
Brad Davidson 2023-02-13 20:00:52 +00:00 committed by Brad Davidson
parent 9efa0797b7
commit 0c302f4341
5 changed files with 51 additions and 160 deletions

View File

@ -144,7 +144,8 @@ func (c *Cluster) assignManagedDriver(ctx context.Context) error {
return nil
}
// setupEtcdProxy
// setupEtcdProxy periodically updates the etcd proxy with the current list of
// cluster client URLs, as retrieved from etcd.
func (c *Cluster) setupEtcdProxy(ctx context.Context, etcdProxy etcd.Proxy) {
if c.managedDB == nil {
return

View File

@ -38,11 +38,13 @@ func (e *Embedded) ETCD(ctx context.Context, args ETCDConfig, extraArgs []string
if errors.Is(err, rafthttp.ErrMemberRemoved) {
tombstoneFile := filepath.Join(args.DataDir, "tombstone")
if err := os.WriteFile(tombstoneFile, []byte{}, 0600); err != nil {
logrus.Fatalf("failed to write tombstone file to %s", tombstoneFile)
logrus.Fatalf("Failed to write tombstone file to %s: %v", tombstoneFile, err)
}
logrus.Infof("this node has been removed from the cluster please restart %s to rejoin the cluster", version.Program)
etcd.Close()
logrus.Infof("This node has been removed from the cluster - please restart %s to rejoin the cluster", version.Program)
return
}
logrus.Errorf("etcd error: %v", err)
case <-ctx.Done():
logrus.Infof("stopping etcd")
etcd.Close()

View File

@ -554,15 +554,17 @@ func (e *ETCD) Register(ctx context.Context, config *config.Control, handler htt
}
// The apiserver endpoint controller needs to run on a node with a local apiserver,
// in order to successfully seed etcd with the endpoint list.
// in order to successfully seed etcd with the endpoint list. The member removal controller
// also needs to run on a non-etcd node, so as to avoid the disruption that would occur if
// it ran on the node that is being removed from the cluster.
if !e.config.DisableAPIServer {
e.config.Runtime.LeaderElectedClusterControllerStarts["etcd-apiserver-endpoints"] = func(ctx context.Context) {
e.config.Runtime.LeaderElectedClusterControllerStarts[version.Program+"-etcd"] = func(ctx context.Context) {
registerEndpointsHandlers(ctx, e)
registerMemberHandlers(ctx, e)
}
}
// The etcd member-removal controllers should only run on an etcd node. Tombstone file checking
// is also unnecessary if we're not running etcd.
// Tombstone file checking is unnecessary if we're not running etcd.
if !e.config.DisableETCD {
tombstoneFile := filepath.Join(DBDir(e.config), "tombstone")
if _, err := os.Stat(tombstoneFile); err == nil {
@ -575,10 +577,6 @@ func (e *ETCD) Register(ctx context.Context, config *config.Control, handler htt
if err := e.setName(false); err != nil {
return nil, err
}
e.config.Runtime.LeaderElectedClusterControllerStarts["etcd-member-removal"] = func(ctx context.Context) {
registerMemberHandlers(ctx, e)
}
}
return e.handler(handler), nil
@ -666,6 +664,8 @@ func getClientConfig(ctx context.Context, control *config.Control, endpoints ...
DialTimeout: defaultDialTimeout,
DialKeepAliveTime: defaultKeepAliveTime,
DialKeepAliveTimeout: defaultKeepAliveTimeout,
AutoSyncInterval: defaultKeepAliveTimeout,
PermitWithoutStream: true,
}
var err error
@ -2126,21 +2126,7 @@ func GetAPIServerURLsFromETCD(ctx context.Context, cfg *config.Control) ([]strin
// GetMembersClientURLs will list through the member lists in etcd and return
// back a combined list of client urls for each member in the cluster
func (e *ETCD) GetMembersClientURLs(ctx context.Context) ([]string, error) {
ctx, cancel := context.WithTimeout(ctx, testTimeout)
defer cancel()
members, err := e.client.MemberList(ctx)
if err != nil {
return nil, err
}
var memberUrls []string
for _, member := range members.Members {
for _, clientURL := range member.ClientURLs {
memberUrls = append(memberUrls, string(clientURL))
}
}
return memberUrls, nil
return e.client.Endpoints(), nil
}
// GetMembersNames will list through the member lists in etcd and return

View File

@ -1,103 +0,0 @@
package server
import (
"context"
"os"
"path/filepath"
"time"
"github.com/k3s-io/k3s/pkg/etcd"
"github.com/sirupsen/logrus"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// setETCDLabelsAndAnnotations sets the etcd role label on the node named by the
// NODE_NAME environment variable, and records the etcd node name and etcd
// advertise address as annotations on that node. The control-plane and master
// role labels are removed from the node if they are present.
//
// It waits for the apiserver to become ready, then retries every five seconds
// until the node has been updated successfully, or until the node is found to
// already carry the etcd role label with no control-plane labels to remove.
// It returns nil on success, or ctx.Err() if ctx is cancelled first.
func setETCDLabelsAndAnnotations(ctx context.Context, config *Config) error {
	// Do not wait on the apiserver forever if we are being shut down.
	select {
	case <-config.ControlConfig.Runtime.APIServerReady:
	case <-ctx.Done():
		return ctx.Err()
	}

	controlConfig := &config.ControlConfig

	t := time.NewTicker(5 * time.Second)
	defer t.Stop()

	for {
		// Wait for the next tick, but give up as soon as the context is
		// cancelled. Every failure below retries from here via continue.
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-t.C:
		}

		sc, err := NewContext(ctx, controlConfig.Runtime.KubeConfigAdmin)
		if err != nil {
			logrus.Infof("Failed to set etcd role label: %v", err)
			continue
		}

		if err := sc.Start(ctx); err != nil {
			logrus.Infof("Failed to set etcd role label: %v", err)
			continue
		}

		controlConfig.Runtime.Core = sc.Core
		nodes := sc.Core.Core().V1().Node()

		nodeName := os.Getenv("NODE_NAME")
		if nodeName == "" {
			logrus.Info("Failed to set etcd role label: node name not set")
			continue
		}
		node, err := nodes.Get(nodeName, metav1.GetOptions{})
		if err != nil {
			logrus.Infof("Failed to set etcd role label: %v", err)
			continue
		}

		if node.Labels == nil {
			node.Labels = make(map[string]string)
		}

		// Remove the control-plane role labels if they exist, remembering
		// whether anything was removed so that the node is updated below.
		var controlRoleLabelExists bool
		if _, ok := node.Labels[MasterRoleLabelKey]; ok {
			delete(node.Labels, MasterRoleLabelKey)
			controlRoleLabelExists = true
		}
		if _, ok := node.Labels[ControlPlaneRoleLabelKey]; ok {
			delete(node.Labels, ControlPlaneRoleLabelKey)
			controlRoleLabelExists = true
		}

		// Nothing to do if the etcd label is already set and no
		// control-plane labels had to be removed.
		if v, ok := node.Labels[ETCDRoleLabelKey]; ok && v == "true" && !controlRoleLabelExists {
			return nil
		}

		node.Labels[ETCDRoleLabelKey] = "true"

		// This is a replacement for the etcd controller's handleself function.
		if node.Annotations == nil {
			node.Annotations = map[string]string{}
		}
		fileName := filepath.Join(controlConfig.DataDir, "db", "etcd", "name")

		data, err := os.ReadFile(fileName)
		if err != nil {
			logrus.Infof("Waiting for etcd node name file to be available: %v", err)
			continue
		}
		etcdNodeName := string(data)
		node.Annotations[etcd.NodeNameAnnotation] = etcdNodeName

		address, err := etcd.GetAdvertiseAddress(controlConfig.PrivateIP)
		if err != nil {
			logrus.Infof("Waiting for etcd node address to be available: %v", err)
			continue
		}
		node.Annotations[etcd.NodeAddressAnnotation] = address

		if _, err := nodes.Update(node); err != nil {
			// Previously a failed update blocked here until ctx was
			// cancelled and was never retried; log it and try again on
			// the next tick instead.
			logrus.Infof("Failed to set etcd role label: %v", err)
			continue
		}
		logrus.Infof("Successfully set etcd role label and annotations on node %s", nodeName)
		return nil
	}
}

View File

@ -78,11 +78,7 @@ func StartServer(ctx context.Context, config *Config, cfg *cmds.Server) error {
}
}
if config.ControlConfig.DisableAPIServer {
go setETCDLabelsAndAnnotations(ctx, config)
} else {
go startOnAPIServerReady(ctx, config)
}
go startOnAPIServerReady(ctx, config)
if err := printTokens(&config.ControlConfig); err != nil {
return err
@ -138,20 +134,9 @@ func runControllers(ctx context.Context, config *Config) error {
return errors.Wrap(err, "failed to start wranger controllers")
}
controlConfig.Runtime.LeaderElectedClusterControllerStarts[version.Program] = func(ctx context.Context) {
if controlConfig.DisableAPIServer {
return
}
if err := coreControllers(ctx, sc, config); err != nil {
panic(err)
}
for _, controller := range config.LeaderControllers {
if err := controller(ctx, sc); err != nil {
panic(errors.Wrapf(err, "failed to start %s leader controller", util.GetFunctionName(controller)))
}
}
if err := sc.Start(ctx); err != nil {
panic(err)
if !controlConfig.DisableAPIServer {
controlConfig.Runtime.LeaderElectedClusterControllerStarts[version.Program] = func(ctx context.Context) {
apiserverControllers(ctx, sc, config)
}
}
@ -172,6 +157,22 @@ func runControllers(ctx context.Context, config *Config) error {
return nil
}
// apiserverControllers starts the core controllers, then each of the configured
// leader-elected controllers, and finally the shared controller context. These
// should only run on a control-plane node. It panics if any step fails to start.
func apiserverControllers(ctx context.Context, sc *Context, config *Config) {
	err := coreControllers(ctx, sc, config)
	if err != nil {
		panic(err)
	}

	for i := range config.LeaderControllers {
		start := config.LeaderControllers[i]
		if startErr := start(ctx, sc); startErr != nil {
			panic(errors.Wrapf(startErr, "failed to start %s leader controller", util.GetFunctionName(start)))
		}
	}

	err = sc.Start(ctx)
	if err != nil {
		panic(err)
	}
}
// runOrDie is similar to leader.RunOrDie, except that it runs the callback
// immediately, without performing leader election.
func runOrDie(ctx context.Context, name string, cb leader.Callback) {
@ -184,6 +185,12 @@ func runOrDie(ctx context.Context, name string, cb leader.Callback) {
<-ctx.Done()
}
// coreControllers starts the following controllers, if they are enabled:
// * Node controller (manages nodes passwords and coredns hosts file)
// * Helm controller
// * Secrets encryption
// * Rootless ports
// These controllers should only be run on nodes with a local apiserver
func coreControllers(ctx context.Context, sc *Context, config *Config) error {
if err := node.Register(ctx,
!config.ControlConfig.Skips["coredns"],
@ -237,6 +244,9 @@ func coreControllers(ctx context.Context, sc *Context, config *Config) error {
}
func stageFiles(ctx context.Context, sc *Context, controlConfig *config.Control) error {
if controlConfig.DisableAPIServer {
return nil
}
dataDir := filepath.Join(controlConfig.DataDir, "static")
if err := static.Stage(dataDir); err != nil {
return err
@ -527,19 +537,11 @@ func setNodeLabelsAndAnnotations(ctx context.Context, nodes v1.NodeClient, confi
time.Sleep(1 * time.Second)
continue
}
// remove etcd label if etcd is disabled
var etcdRoleLabelExists bool
if config.ControlConfig.DisableETCD {
if _, ok := node.Labels[ETCDRoleLabelKey]; ok {
delete(node.Labels, ETCDRoleLabelKey)
etcdRoleLabelExists = true
}
}
if node.Labels == nil {
node.Labels = make(map[string]string)
}
v, ok := node.Labels[ControlPlaneRoleLabelKey]
if !ok || v != "true" || etcdRoleLabelExists {
if !ok || v != "true" {
node.Labels[ControlPlaneRoleLabelKey] = "true"
node.Labels[MasterRoleLabelKey] = "true"
}
@ -565,15 +567,18 @@ func setNodeLabelsAndAnnotations(ctx context.Context, nodes v1.NodeClient, confi
return nil
}
func setClusterDNSConfig(ctx context.Context, controlConfig *Config, configMap v1.ConfigMapClient) error {
func setClusterDNSConfig(ctx context.Context, config *Config, configMap v1.ConfigMapClient) error {
if config.ControlConfig.DisableAPIServer {
return nil
}
// check if configmap already exists
_, err := configMap.Get("kube-system", "cluster-dns", metav1.GetOptions{})
if err == nil {
logrus.Infof("Cluster dns configmap already exists")
return nil
}
clusterDNS := controlConfig.ControlConfig.ClusterDNS
clusterDomain := controlConfig.ControlConfig.ClusterDomain
clusterDNS := config.ControlConfig.ClusterDNS
clusterDomain := config.ControlConfig.ClusterDomain
c := &corev1.ConfigMap{
TypeMeta: metav1.TypeMeta{
Kind: "ConfigMap",