mirror of https://github.com/k3s-io/k3s
Fix etcd member deletion
Turns out etcd-only nodes were never running **any** of the controllers, so allowing multiple controllers didn't really fix things.

Signed-off-by: Brad Davidson <brad.davidson@rancher.com>

pull/6975/head
parent
14f2226b67
commit
4e03608119
|
@ -144,7 +144,8 @@ func (c *Cluster) assignManagedDriver(ctx context.Context) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// setupEtcdProxy
|
// setupEtcdProxy periodically updates the etcd proxy with the current list of
|
||||||
|
// cluster client URLs, as retrieved from etcd.
|
||||||
func (c *Cluster) setupEtcdProxy(ctx context.Context, etcdProxy etcd.Proxy) {
|
func (c *Cluster) setupEtcdProxy(ctx context.Context, etcdProxy etcd.Proxy) {
|
||||||
if c.managedDB == nil {
|
if c.managedDB == nil {
|
||||||
return
|
return
|
||||||
|
|
|
@ -38,11 +38,13 @@ func (e *Embedded) ETCD(ctx context.Context, args ETCDConfig, extraArgs []string
|
||||||
if errors.Is(err, rafthttp.ErrMemberRemoved) {
|
if errors.Is(err, rafthttp.ErrMemberRemoved) {
|
||||||
tombstoneFile := filepath.Join(args.DataDir, "tombstone")
|
tombstoneFile := filepath.Join(args.DataDir, "tombstone")
|
||||||
if err := os.WriteFile(tombstoneFile, []byte{}, 0600); err != nil {
|
if err := os.WriteFile(tombstoneFile, []byte{}, 0600); err != nil {
|
||||||
logrus.Fatalf("failed to write tombstone file to %s", tombstoneFile)
|
logrus.Fatalf("Failed to write tombstone file to %s: %v", tombstoneFile, err)
|
||||||
}
|
}
|
||||||
logrus.Infof("this node has been removed from the cluster please restart %s to rejoin the cluster", version.Program)
|
etcd.Close()
|
||||||
|
logrus.Infof("This node has been removed from the cluster - please restart %s to rejoin the cluster", version.Program)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
logrus.Errorf("etcd error: %v", err)
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
logrus.Infof("stopping etcd")
|
logrus.Infof("stopping etcd")
|
||||||
etcd.Close()
|
etcd.Close()
|
||||||
|
|
|
@ -554,15 +554,17 @@ func (e *ETCD) Register(ctx context.Context, config *config.Control, handler htt
|
||||||
}
|
}
|
||||||
|
|
||||||
// The apiserver endpoint controller needs to run on a node with a local apiserver,
|
// The apiserver endpoint controller needs to run on a node with a local apiserver,
|
||||||
// in order to successfully seed etcd with the endpoint list.
|
// in order to successfully seed etcd with the endpoint list. The member removal controller
|
||||||
|
// also needs to run on a non-etcd node so as to avoid disruption if running on the node that
|
||||||
|
// is being removed from the cluster.
|
||||||
if !e.config.DisableAPIServer {
|
if !e.config.DisableAPIServer {
|
||||||
e.config.Runtime.LeaderElectedClusterControllerStarts["etcd-apiserver-endpoints"] = func(ctx context.Context) {
|
e.config.Runtime.LeaderElectedClusterControllerStarts[version.Program+"-etcd"] = func(ctx context.Context) {
|
||||||
registerEndpointsHandlers(ctx, e)
|
registerEndpointsHandlers(ctx, e)
|
||||||
|
registerMemberHandlers(ctx, e)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// The etcd member-removal controllers should only run on an etcd node. Tombstone file checking
|
// Tombstone file checking is unnecessary if we're not running etcd.
|
||||||
// is also unnecessary if we're not running etcd.
|
|
||||||
if !e.config.DisableETCD {
|
if !e.config.DisableETCD {
|
||||||
tombstoneFile := filepath.Join(DBDir(e.config), "tombstone")
|
tombstoneFile := filepath.Join(DBDir(e.config), "tombstone")
|
||||||
if _, err := os.Stat(tombstoneFile); err == nil {
|
if _, err := os.Stat(tombstoneFile); err == nil {
|
||||||
|
@ -575,10 +577,6 @@ func (e *ETCD) Register(ctx context.Context, config *config.Control, handler htt
|
||||||
if err := e.setName(false); err != nil {
|
if err := e.setName(false); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
e.config.Runtime.LeaderElectedClusterControllerStarts["etcd-member-removal"] = func(ctx context.Context) {
|
|
||||||
registerMemberHandlers(ctx, e)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return e.handler(handler), nil
|
return e.handler(handler), nil
|
||||||
|
@ -666,6 +664,8 @@ func getClientConfig(ctx context.Context, control *config.Control, endpoints ...
|
||||||
DialTimeout: defaultDialTimeout,
|
DialTimeout: defaultDialTimeout,
|
||||||
DialKeepAliveTime: defaultKeepAliveTime,
|
DialKeepAliveTime: defaultKeepAliveTime,
|
||||||
DialKeepAliveTimeout: defaultKeepAliveTimeout,
|
DialKeepAliveTimeout: defaultKeepAliveTimeout,
|
||||||
|
AutoSyncInterval: defaultKeepAliveTimeout,
|
||||||
|
PermitWithoutStream: true,
|
||||||
}
|
}
|
||||||
|
|
||||||
var err error
|
var err error
|
||||||
|
@ -2126,21 +2126,7 @@ func GetAPIServerURLsFromETCD(ctx context.Context, cfg *config.Control) ([]strin
|
||||||
// GetMembersClientURLs will list through the member lists in etcd and return
|
// GetMembersClientURLs will list through the member lists in etcd and return
|
||||||
// back a combined list of client urls for each member in the cluster
|
// back a combined list of client urls for each member in the cluster
|
||||||
func (e *ETCD) GetMembersClientURLs(ctx context.Context) ([]string, error) {
|
func (e *ETCD) GetMembersClientURLs(ctx context.Context) ([]string, error) {
|
||||||
ctx, cancel := context.WithTimeout(ctx, testTimeout)
|
return e.client.Endpoints(), nil
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
members, err := e.client.MemberList(ctx)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
var memberUrls []string
|
|
||||||
for _, member := range members.Members {
|
|
||||||
for _, clientURL := range member.ClientURLs {
|
|
||||||
memberUrls = append(memberUrls, string(clientURL))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return memberUrls, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetMembersNames will list through the member lists in etcd and return
|
// GetMembersNames will list through the member lists in etcd and return
|
||||||
|
|
|
@ -1,103 +0,0 @@
|
||||||
package server
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"os"
|
|
||||||
"path/filepath"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/k3s-io/k3s/pkg/etcd"
|
|
||||||
"github.com/sirupsen/logrus"
|
|
||||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
||||||
)
|
|
||||||
|
|
||||||
// setETCDLabelsAndAnnotations will set the etcd role label if not exists also it
|
|
||||||
// sets special annotations on the node object which are etcd node id and etcd node
|
|
||||||
// address, the function will also remove the controlplane and master role labels if
|
|
||||||
// they exist on the node
|
|
||||||
func setETCDLabelsAndAnnotations(ctx context.Context, config *Config) error {
|
|
||||||
<-config.ControlConfig.Runtime.APIServerReady
|
|
||||||
t := time.NewTicker(5 * time.Second)
|
|
||||||
defer t.Stop()
|
|
||||||
for range t.C {
|
|
||||||
controlConfig := &config.ControlConfig
|
|
||||||
|
|
||||||
sc, err := NewContext(ctx, controlConfig.Runtime.KubeConfigAdmin)
|
|
||||||
if err != nil {
|
|
||||||
logrus.Infof("Failed to set etcd role label: %v", err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := sc.Start(ctx); err != nil {
|
|
||||||
logrus.Infof("Failed to set etcd role label: %v", err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
controlConfig.Runtime.Core = sc.Core
|
|
||||||
nodes := sc.Core.Core().V1().Node()
|
|
||||||
|
|
||||||
nodeName := os.Getenv("NODE_NAME")
|
|
||||||
if nodeName == "" {
|
|
||||||
logrus.Info("Failed to set etcd role label: node name not set")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
node, err := nodes.Get(nodeName, metav1.GetOptions{})
|
|
||||||
if err != nil {
|
|
||||||
logrus.Infof("Failed to set etcd role label: %v", err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
if node.Labels == nil {
|
|
||||||
node.Labels = make(map[string]string)
|
|
||||||
}
|
|
||||||
|
|
||||||
// remove controlplane label if role label exists
|
|
||||||
var controlRoleLabelExists bool
|
|
||||||
if _, ok := node.Labels[MasterRoleLabelKey]; ok {
|
|
||||||
delete(node.Labels, MasterRoleLabelKey)
|
|
||||||
controlRoleLabelExists = true
|
|
||||||
}
|
|
||||||
if _, ok := node.Labels[ControlPlaneRoleLabelKey]; ok {
|
|
||||||
delete(node.Labels, ControlPlaneRoleLabelKey)
|
|
||||||
controlRoleLabelExists = true
|
|
||||||
}
|
|
||||||
|
|
||||||
if v, ok := node.Labels[ETCDRoleLabelKey]; ok && v == "true" && !controlRoleLabelExists {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
|
|
||||||
node.Labels[ETCDRoleLabelKey] = "true"
|
|
||||||
|
|
||||||
// this is replacement to the etcd controller handleself function
|
|
||||||
if node.Annotations == nil {
|
|
||||||
node.Annotations = map[string]string{}
|
|
||||||
}
|
|
||||||
fileName := filepath.Join(controlConfig.DataDir, "db", "etcd", "name")
|
|
||||||
|
|
||||||
data, err := os.ReadFile(fileName)
|
|
||||||
if err != nil {
|
|
||||||
logrus.Infof("Waiting for etcd node name file to be available: %v", err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
etcdNodeName := string(data)
|
|
||||||
node.Annotations[etcd.NodeNameAnnotation] = etcdNodeName
|
|
||||||
|
|
||||||
address, err := etcd.GetAdvertiseAddress(controlConfig.PrivateIP)
|
|
||||||
if err != nil {
|
|
||||||
logrus.Infof("Waiting for etcd node address to be available: %v", err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
node.Annotations[etcd.NodeAddressAnnotation] = address
|
|
||||||
|
|
||||||
_, err = nodes.Update(node)
|
|
||||||
if err == nil {
|
|
||||||
logrus.Infof("Successfully set etcd role label and annotations on node %s", nodeName)
|
|
||||||
break
|
|
||||||
}
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
return ctx.Err()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
|
@ -78,11 +78,7 @@ func StartServer(ctx context.Context, config *Config, cfg *cmds.Server) error {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if config.ControlConfig.DisableAPIServer {
|
go startOnAPIServerReady(ctx, config)
|
||||||
go setETCDLabelsAndAnnotations(ctx, config)
|
|
||||||
} else {
|
|
||||||
go startOnAPIServerReady(ctx, config)
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := printTokens(&config.ControlConfig); err != nil {
|
if err := printTokens(&config.ControlConfig); err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -138,20 +134,9 @@ func runControllers(ctx context.Context, config *Config) error {
|
||||||
return errors.Wrap(err, "failed to start wranger controllers")
|
return errors.Wrap(err, "failed to start wranger controllers")
|
||||||
}
|
}
|
||||||
|
|
||||||
controlConfig.Runtime.LeaderElectedClusterControllerStarts[version.Program] = func(ctx context.Context) {
|
if !controlConfig.DisableAPIServer {
|
||||||
if controlConfig.DisableAPIServer {
|
controlConfig.Runtime.LeaderElectedClusterControllerStarts[version.Program] = func(ctx context.Context) {
|
||||||
return
|
apiserverControllers(ctx, sc, config)
|
||||||
}
|
|
||||||
if err := coreControllers(ctx, sc, config); err != nil {
|
|
||||||
panic(err)
|
|
||||||
}
|
|
||||||
for _, controller := range config.LeaderControllers {
|
|
||||||
if err := controller(ctx, sc); err != nil {
|
|
||||||
panic(errors.Wrapf(err, "failed to start %s leader controller", util.GetFunctionName(controller)))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if err := sc.Start(ctx); err != nil {
|
|
||||||
panic(err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -172,6 +157,22 @@ func runControllers(ctx context.Context, config *Config) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// apiserverControllers starts the core controllers, as well as the leader-elected controllers
|
||||||
|
// that should only run on a control-plane node.
|
||||||
|
func apiserverControllers(ctx context.Context, sc *Context, config *Config) {
|
||||||
|
if err := coreControllers(ctx, sc, config); err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
for _, controller := range config.LeaderControllers {
|
||||||
|
if err := controller(ctx, sc); err != nil {
|
||||||
|
panic(errors.Wrapf(err, "failed to start %s leader controller", util.GetFunctionName(controller)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := sc.Start(ctx); err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// runOrDie is similar to leader.RunOrDie, except that it runs the callback
|
// runOrDie is similar to leader.RunOrDie, except that it runs the callback
|
||||||
// immediately, without performing leader election.
|
// immediately, without performing leader election.
|
||||||
func runOrDie(ctx context.Context, name string, cb leader.Callback) {
|
func runOrDie(ctx context.Context, name string, cb leader.Callback) {
|
||||||
|
@ -184,6 +185,12 @@ func runOrDie(ctx context.Context, name string, cb leader.Callback) {
|
||||||
<-ctx.Done()
|
<-ctx.Done()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// coreControllers starts the following controllers, if they are enabled:
|
||||||
|
// * Node controller (manages nodes passwords and coredns hosts file)
|
||||||
|
// * Helm controller
|
||||||
|
// * Secrets encryption
|
||||||
|
// * Rootless ports
|
||||||
|
// These controllers should only be run on nodes with a local apiserver
|
||||||
func coreControllers(ctx context.Context, sc *Context, config *Config) error {
|
func coreControllers(ctx context.Context, sc *Context, config *Config) error {
|
||||||
if err := node.Register(ctx,
|
if err := node.Register(ctx,
|
||||||
!config.ControlConfig.Skips["coredns"],
|
!config.ControlConfig.Skips["coredns"],
|
||||||
|
@ -237,6 +244,9 @@ func coreControllers(ctx context.Context, sc *Context, config *Config) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
func stageFiles(ctx context.Context, sc *Context, controlConfig *config.Control) error {
|
func stageFiles(ctx context.Context, sc *Context, controlConfig *config.Control) error {
|
||||||
|
if controlConfig.DisableAPIServer {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
dataDir := filepath.Join(controlConfig.DataDir, "static")
|
dataDir := filepath.Join(controlConfig.DataDir, "static")
|
||||||
if err := static.Stage(dataDir); err != nil {
|
if err := static.Stage(dataDir); err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -527,19 +537,11 @@ func setNodeLabelsAndAnnotations(ctx context.Context, nodes v1.NodeClient, confi
|
||||||
time.Sleep(1 * time.Second)
|
time.Sleep(1 * time.Second)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// remove etcd label if etcd is disabled
|
|
||||||
var etcdRoleLabelExists bool
|
|
||||||
if config.ControlConfig.DisableETCD {
|
|
||||||
if _, ok := node.Labels[ETCDRoleLabelKey]; ok {
|
|
||||||
delete(node.Labels, ETCDRoleLabelKey)
|
|
||||||
etcdRoleLabelExists = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if node.Labels == nil {
|
if node.Labels == nil {
|
||||||
node.Labels = make(map[string]string)
|
node.Labels = make(map[string]string)
|
||||||
}
|
}
|
||||||
v, ok := node.Labels[ControlPlaneRoleLabelKey]
|
v, ok := node.Labels[ControlPlaneRoleLabelKey]
|
||||||
if !ok || v != "true" || etcdRoleLabelExists {
|
if !ok || v != "true" {
|
||||||
node.Labels[ControlPlaneRoleLabelKey] = "true"
|
node.Labels[ControlPlaneRoleLabelKey] = "true"
|
||||||
node.Labels[MasterRoleLabelKey] = "true"
|
node.Labels[MasterRoleLabelKey] = "true"
|
||||||
}
|
}
|
||||||
|
@ -565,15 +567,18 @@ func setNodeLabelsAndAnnotations(ctx context.Context, nodes v1.NodeClient, confi
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func setClusterDNSConfig(ctx context.Context, controlConfig *Config, configMap v1.ConfigMapClient) error {
|
func setClusterDNSConfig(ctx context.Context, config *Config, configMap v1.ConfigMapClient) error {
|
||||||
|
if config.ControlConfig.DisableAPIServer {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
// check if configmap already exists
|
// check if configmap already exists
|
||||||
_, err := configMap.Get("kube-system", "cluster-dns", metav1.GetOptions{})
|
_, err := configMap.Get("kube-system", "cluster-dns", metav1.GetOptions{})
|
||||||
if err == nil {
|
if err == nil {
|
||||||
logrus.Infof("Cluster dns configmap already exists")
|
logrus.Infof("Cluster dns configmap already exists")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
clusterDNS := controlConfig.ControlConfig.ClusterDNS
|
clusterDNS := config.ControlConfig.ClusterDNS
|
||||||
clusterDomain := controlConfig.ControlConfig.ClusterDomain
|
clusterDomain := config.ControlConfig.ClusterDomain
|
||||||
c := &corev1.ConfigMap{
|
c := &corev1.ConfigMap{
|
||||||
TypeMeta: metav1.TypeMeta{
|
TypeMeta: metav1.TypeMeta{
|
||||||
Kind: "ConfigMap",
|
Kind: "ConfigMap",
|
||||||
|
|
Loading…
Reference in New Issue