mirror of https://github.com/k3s-io/k3s
Fix etcd member deletion
Turns out etcd-only nodes were never running **any** of the controllers, so allowing multiple controllers didn't really fix things.
Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
pull/6974/head
parent
9efa0797b7
commit
0c302f4341
|
@ -144,7 +144,8 @@ func (c *Cluster) assignManagedDriver(ctx context.Context) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// setupEtcdProxy
|
||||
// setupEtcdProxy periodically updates the etcd proxy with the current list of
|
||||
// cluster client URLs, as retrieved from etcd.
|
||||
func (c *Cluster) setupEtcdProxy(ctx context.Context, etcdProxy etcd.Proxy) {
|
||||
if c.managedDB == nil {
|
||||
return
|
||||
|
|
|
@ -38,11 +38,13 @@ func (e *Embedded) ETCD(ctx context.Context, args ETCDConfig, extraArgs []string
|
|||
if errors.Is(err, rafthttp.ErrMemberRemoved) {
|
||||
tombstoneFile := filepath.Join(args.DataDir, "tombstone")
|
||||
if err := os.WriteFile(tombstoneFile, []byte{}, 0600); err != nil {
|
||||
logrus.Fatalf("failed to write tombstone file to %s", tombstoneFile)
|
||||
logrus.Fatalf("Failed to write tombstone file to %s: %v", tombstoneFile, err)
|
||||
}
|
||||
logrus.Infof("this node has been removed from the cluster please restart %s to rejoin the cluster", version.Program)
|
||||
etcd.Close()
|
||||
logrus.Infof("This node has been removed from the cluster - please restart %s to rejoin the cluster", version.Program)
|
||||
return
|
||||
}
|
||||
logrus.Errorf("etcd error: %v", err)
|
||||
case <-ctx.Done():
|
||||
logrus.Infof("stopping etcd")
|
||||
etcd.Close()
|
||||
|
|
|
@ -554,15 +554,17 @@ func (e *ETCD) Register(ctx context.Context, config *config.Control, handler htt
|
|||
}
|
||||
|
||||
// The apiserver endpoint controller needs to run on a node with a local apiserver,
|
||||
// in order to successfully seed etcd with the endpoint list.
|
||||
// in order to successfully seed etcd with the endpoint list. The member removal controller
|
||||
// also needs to run on a non-etcd node so as to avoid disruption if running on the node that
|
||||
// is being removed from the cluster.
|
||||
if !e.config.DisableAPIServer {
|
||||
e.config.Runtime.LeaderElectedClusterControllerStarts["etcd-apiserver-endpoints"] = func(ctx context.Context) {
|
||||
e.config.Runtime.LeaderElectedClusterControllerStarts[version.Program+"-etcd"] = func(ctx context.Context) {
|
||||
registerEndpointsHandlers(ctx, e)
|
||||
registerMemberHandlers(ctx, e)
|
||||
}
|
||||
}
|
||||
|
||||
// The etcd member-removal controllers should only run on an etcd node. Tombstone file checking
|
||||
// is also unnecessary if we're not running etcd.
|
||||
// Tombstone file checking is unnecessary if we're not running etcd.
|
||||
if !e.config.DisableETCD {
|
||||
tombstoneFile := filepath.Join(DBDir(e.config), "tombstone")
|
||||
if _, err := os.Stat(tombstoneFile); err == nil {
|
||||
|
@ -575,10 +577,6 @@ func (e *ETCD) Register(ctx context.Context, config *config.Control, handler htt
|
|||
if err := e.setName(false); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
e.config.Runtime.LeaderElectedClusterControllerStarts["etcd-member-removal"] = func(ctx context.Context) {
|
||||
registerMemberHandlers(ctx, e)
|
||||
}
|
||||
}
|
||||
|
||||
return e.handler(handler), nil
|
||||
|
@ -666,6 +664,8 @@ func getClientConfig(ctx context.Context, control *config.Control, endpoints ...
|
|||
DialTimeout: defaultDialTimeout,
|
||||
DialKeepAliveTime: defaultKeepAliveTime,
|
||||
DialKeepAliveTimeout: defaultKeepAliveTimeout,
|
||||
AutoSyncInterval: defaultKeepAliveTimeout,
|
||||
PermitWithoutStream: true,
|
||||
}
|
||||
|
||||
var err error
|
||||
|
@ -2126,21 +2126,7 @@ func GetAPIServerURLsFromETCD(ctx context.Context, cfg *config.Control) ([]strin
|
|||
// GetMembersClientURLs will list through the member lists in etcd and return
|
||||
// back a combined list of client urls for each member in the cluster
|
||||
func (e *ETCD) GetMembersClientURLs(ctx context.Context) ([]string, error) {
|
||||
ctx, cancel := context.WithTimeout(ctx, testTimeout)
|
||||
defer cancel()
|
||||
|
||||
members, err := e.client.MemberList(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var memberUrls []string
|
||||
for _, member := range members.Members {
|
||||
for _, clientURL := range member.ClientURLs {
|
||||
memberUrls = append(memberUrls, string(clientURL))
|
||||
}
|
||||
}
|
||||
return memberUrls, nil
|
||||
return e.client.Endpoints(), nil
|
||||
}
|
||||
|
||||
// GetMembersNames will list through the member lists in etcd and return
|
||||
|
|
|
@ -1,103 +0,0 @@
|
|||
package server
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
"github.com/k3s-io/k3s/pkg/etcd"
|
||||
"github.com/sirupsen/logrus"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
)
|
||||
|
||||
// setETCDLabelsAndAnnotations will set the etcd role label if it does not exist; it also
|
||||
// sets special annotations on the node object which are etcd node id and etcd node
|
||||
// address, the function will also remove the controlplane and master role labels if
|
||||
// they exist on the node
|
||||
func setETCDLabelsAndAnnotations(ctx context.Context, config *Config) error {
|
||||
<-config.ControlConfig.Runtime.APIServerReady
|
||||
t := time.NewTicker(5 * time.Second)
|
||||
defer t.Stop()
|
||||
for range t.C {
|
||||
controlConfig := &config.ControlConfig
|
||||
|
||||
sc, err := NewContext(ctx, controlConfig.Runtime.KubeConfigAdmin)
|
||||
if err != nil {
|
||||
logrus.Infof("Failed to set etcd role label: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
if err := sc.Start(ctx); err != nil {
|
||||
logrus.Infof("Failed to set etcd role label: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
controlConfig.Runtime.Core = sc.Core
|
||||
nodes := sc.Core.Core().V1().Node()
|
||||
|
||||
nodeName := os.Getenv("NODE_NAME")
|
||||
if nodeName == "" {
|
||||
logrus.Info("Failed to set etcd role label: node name not set")
|
||||
continue
|
||||
}
|
||||
node, err := nodes.Get(nodeName, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
logrus.Infof("Failed to set etcd role label: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
if node.Labels == nil {
|
||||
node.Labels = make(map[string]string)
|
||||
}
|
||||
|
||||
// remove controlplane label if role label exists
|
||||
var controlRoleLabelExists bool
|
||||
if _, ok := node.Labels[MasterRoleLabelKey]; ok {
|
||||
delete(node.Labels, MasterRoleLabelKey)
|
||||
controlRoleLabelExists = true
|
||||
}
|
||||
if _, ok := node.Labels[ControlPlaneRoleLabelKey]; ok {
|
||||
delete(node.Labels, ControlPlaneRoleLabelKey)
|
||||
controlRoleLabelExists = true
|
||||
}
|
||||
|
||||
if v, ok := node.Labels[ETCDRoleLabelKey]; ok && v == "true" && !controlRoleLabelExists {
|
||||
break
|
||||
}
|
||||
|
||||
node.Labels[ETCDRoleLabelKey] = "true"
|
||||
|
||||
// this is replacement to the etcd controller handleself function
|
||||
if node.Annotations == nil {
|
||||
node.Annotations = map[string]string{}
|
||||
}
|
||||
fileName := filepath.Join(controlConfig.DataDir, "db", "etcd", "name")
|
||||
|
||||
data, err := os.ReadFile(fileName)
|
||||
if err != nil {
|
||||
logrus.Infof("Waiting for etcd node name file to be available: %v", err)
|
||||
continue
|
||||
}
|
||||
etcdNodeName := string(data)
|
||||
node.Annotations[etcd.NodeNameAnnotation] = etcdNodeName
|
||||
|
||||
address, err := etcd.GetAdvertiseAddress(controlConfig.PrivateIP)
|
||||
if err != nil {
|
||||
logrus.Infof("Waiting for etcd node address to be available: %v", err)
|
||||
continue
|
||||
}
|
||||
node.Annotations[etcd.NodeAddressAnnotation] = address
|
||||
|
||||
_, err = nodes.Update(node)
|
||||
if err == nil {
|
||||
logrus.Infof("Successfully set etcd role label and annotations on node %s", nodeName)
|
||||
break
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
|
@ -78,11 +78,7 @@ func StartServer(ctx context.Context, config *Config, cfg *cmds.Server) error {
|
|||
}
|
||||
}
|
||||
|
||||
if config.ControlConfig.DisableAPIServer {
|
||||
go setETCDLabelsAndAnnotations(ctx, config)
|
||||
} else {
|
||||
go startOnAPIServerReady(ctx, config)
|
||||
}
|
||||
go startOnAPIServerReady(ctx, config)
|
||||
|
||||
if err := printTokens(&config.ControlConfig); err != nil {
|
||||
return err
|
||||
|
@ -138,20 +134,9 @@ func runControllers(ctx context.Context, config *Config) error {
|
|||
return errors.Wrap(err, "failed to start wranger controllers")
|
||||
}
|
||||
|
||||
controlConfig.Runtime.LeaderElectedClusterControllerStarts[version.Program] = func(ctx context.Context) {
|
||||
if controlConfig.DisableAPIServer {
|
||||
return
|
||||
}
|
||||
if err := coreControllers(ctx, sc, config); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
for _, controller := range config.LeaderControllers {
|
||||
if err := controller(ctx, sc); err != nil {
|
||||
panic(errors.Wrapf(err, "failed to start %s leader controller", util.GetFunctionName(controller)))
|
||||
}
|
||||
}
|
||||
if err := sc.Start(ctx); err != nil {
|
||||
panic(err)
|
||||
if !controlConfig.DisableAPIServer {
|
||||
controlConfig.Runtime.LeaderElectedClusterControllerStarts[version.Program] = func(ctx context.Context) {
|
||||
apiserverControllers(ctx, sc, config)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -172,6 +157,22 @@ func runControllers(ctx context.Context, config *Config) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// apiserverControllers starts the core controllers, as well as the leader-elected controllers
|
||||
// that should only run on a control-plane node.
|
||||
func apiserverControllers(ctx context.Context, sc *Context, config *Config) {
|
||||
if err := coreControllers(ctx, sc, config); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
for _, controller := range config.LeaderControllers {
|
||||
if err := controller(ctx, sc); err != nil {
|
||||
panic(errors.Wrapf(err, "failed to start %s leader controller", util.GetFunctionName(controller)))
|
||||
}
|
||||
}
|
||||
if err := sc.Start(ctx); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
// runOrDie is similar to leader.RunOrDie, except that it runs the callback
|
||||
// immediately, without performing leader election.
|
||||
func runOrDie(ctx context.Context, name string, cb leader.Callback) {
|
||||
|
@ -184,6 +185,12 @@ func runOrDie(ctx context.Context, name string, cb leader.Callback) {
|
|||
<-ctx.Done()
|
||||
}
|
||||
|
||||
// coreControllers starts the following controllers, if they are enabled:
|
||||
// * Node controller (manages nodes passwords and coredns hosts file)
|
||||
// * Helm controller
|
||||
// * Secrets encryption
|
||||
// * Rootless ports
|
||||
// These controllers should only be run on nodes with a local apiserver
|
||||
func coreControllers(ctx context.Context, sc *Context, config *Config) error {
|
||||
if err := node.Register(ctx,
|
||||
!config.ControlConfig.Skips["coredns"],
|
||||
|
@ -237,6 +244,9 @@ func coreControllers(ctx context.Context, sc *Context, config *Config) error {
|
|||
}
|
||||
|
||||
func stageFiles(ctx context.Context, sc *Context, controlConfig *config.Control) error {
|
||||
if controlConfig.DisableAPIServer {
|
||||
return nil
|
||||
}
|
||||
dataDir := filepath.Join(controlConfig.DataDir, "static")
|
||||
if err := static.Stage(dataDir); err != nil {
|
||||
return err
|
||||
|
@ -527,19 +537,11 @@ func setNodeLabelsAndAnnotations(ctx context.Context, nodes v1.NodeClient, confi
|
|||
time.Sleep(1 * time.Second)
|
||||
continue
|
||||
}
|
||||
// remove etcd label if etcd is disabled
|
||||
var etcdRoleLabelExists bool
|
||||
if config.ControlConfig.DisableETCD {
|
||||
if _, ok := node.Labels[ETCDRoleLabelKey]; ok {
|
||||
delete(node.Labels, ETCDRoleLabelKey)
|
||||
etcdRoleLabelExists = true
|
||||
}
|
||||
}
|
||||
if node.Labels == nil {
|
||||
node.Labels = make(map[string]string)
|
||||
}
|
||||
v, ok := node.Labels[ControlPlaneRoleLabelKey]
|
||||
if !ok || v != "true" || etcdRoleLabelExists {
|
||||
if !ok || v != "true" {
|
||||
node.Labels[ControlPlaneRoleLabelKey] = "true"
|
||||
node.Labels[MasterRoleLabelKey] = "true"
|
||||
}
|
||||
|
@ -565,15 +567,18 @@ func setNodeLabelsAndAnnotations(ctx context.Context, nodes v1.NodeClient, confi
|
|||
return nil
|
||||
}
|
||||
|
||||
func setClusterDNSConfig(ctx context.Context, controlConfig *Config, configMap v1.ConfigMapClient) error {
|
||||
func setClusterDNSConfig(ctx context.Context, config *Config, configMap v1.ConfigMapClient) error {
|
||||
if config.ControlConfig.DisableAPIServer {
|
||||
return nil
|
||||
}
|
||||
// check if configmap already exists
|
||||
_, err := configMap.Get("kube-system", "cluster-dns", metav1.GetOptions{})
|
||||
if err == nil {
|
||||
logrus.Infof("Cluster dns configmap already exists")
|
||||
return nil
|
||||
}
|
||||
clusterDNS := controlConfig.ControlConfig.ClusterDNS
|
||||
clusterDomain := controlConfig.ControlConfig.ClusterDomain
|
||||
clusterDNS := config.ControlConfig.ClusterDNS
|
||||
clusterDomain := config.ControlConfig.ClusterDomain
|
||||
c := &corev1.ConfigMap{
|
||||
TypeMeta: metav1.TypeMeta{
|
||||
Kind: "ConfigMap",
|
||||
|
|
Loading…
Reference in New Issue