From a9b5a1933fbd773062be1433d056df07f138a7bd Mon Sep 17 00:00:00 2001 From: Derek Nola Date: Wed, 15 Jun 2022 09:00:52 -0700 Subject: [PATCH] Delay service readiness until after startuphooks have finished (#5649) * Move startup hooks wg into a runtime pointer, check before notifying systemd * Switch default systemd notification to server * Add 1 sec delay to allow etcd to write to disk Signed-off-by: Derek Nola --- pkg/agent/run.go | 10 ++++++++-- pkg/cli/server/server.go | 11 ++++++----- pkg/daemons/config/types.go | 2 ++ pkg/server/secrets-encrypt.go | 4 ++++ pkg/server/server.go | 12 +++++++----- 5 files changed, 27 insertions(+), 12 deletions(-) diff --git a/pkg/agent/run.go b/pkg/agent/run.go index 753d01dba3..1229c2194a 100644 --- a/pkg/agent/run.go +++ b/pkg/agent/run.go @@ -28,6 +28,7 @@ import ( "github.com/k3s-io/k3s/pkg/nodeconfig" "github.com/k3s-io/k3s/pkg/rootless" "github.com/k3s-io/k3s/pkg/util" + "github.com/k3s-io/k3s/pkg/version" "github.com/pkg/errors" "github.com/sirupsen/logrus" v1 "k8s.io/api/core/v1" @@ -146,8 +147,13 @@ func run(ctx context.Context, cfg cmds.Agent, proxy proxy.Proxy) error { } } - os.Setenv("NOTIFY_SOCKET", notifySocket) - systemd.SdNotify(true, "READY=1\n") + // By default, the server is responsible for notifying systemd + // On agent-only nodes, the agent will notify systemd + if notifySocket != "" { + logrus.Info(version.Program + " agent is up and running") + os.Setenv("NOTIFY_SOCKET", notifySocket) + systemd.SdNotify(true, "READY=1\n") + } <-ctx.Done() return ctx.Err() diff --git a/pkg/cli/server/server.go b/pkg/cli/server/server.go index fde31569c4..8d50d8c1fe 100644 --- a/pkg/cli/server/server.go +++ b/pkg/cli/server/server.go @@ -444,6 +444,7 @@ func run(app *cli.Context, cfg *cmds.Server, leaderControllers server.CustomCont logrus.Info("Starting " + version.Program + " " + app.App.Version) notifySocket := os.Getenv("NOTIFY_SOCKET") + os.Unsetenv("NOTIFY_SOCKET") ctx := signals.SetupSignalContext() @@ -455,16 +456,16 @@ func run(app *cli.Context, cfg *cmds.Server, leaderControllers server.CustomCont if !serverConfig.ControlConfig.DisableAPIServer { <-serverConfig.ControlConfig.Runtime.APIServerReady logrus.Info("Kube API server is now running") - } else { + serverConfig.ControlConfig.Runtime.StartupHooksWg.Wait() + } + if !serverConfig.ControlConfig.DisableETCD { <-serverConfig.ControlConfig.Runtime.ETCDReady logrus.Info("ETCD server is now running") } logrus.Info(version.Program + " is up and running") - if (cfg.DisableAgent || cfg.DisableAPIServer) && notifySocket != "" { - os.Setenv("NOTIFY_SOCKET", notifySocket) - systemd.SdNotify(true, "READY=1\n") - } + os.Setenv("NOTIFY_SOCKET", notifySocket) + systemd.SdNotify(true, "READY=1\n") }() url := fmt.Sprintf("https://%s:%d", serverConfig.ControlConfig.BindAddressOrLoopback(false), serverConfig.ControlConfig.SupervisorPort) diff --git a/pkg/daemons/config/types.go b/pkg/daemons/config/types.go index d296761be9..b7cf18b9e2 100644 --- a/pkg/daemons/config/types.go +++ b/pkg/daemons/config/types.go @@ -8,6 +8,7 @@ import ( "net/http" "sort" "strings" + "sync" "time" "github.com/k3s-io/k3s/pkg/util" @@ -269,6 +270,7 @@ type ControlRuntime struct { APIServerReady <-chan struct{} AgentReady <-chan struct{} ETCDReady <-chan struct{} + StartupHooksWg *sync.WaitGroup ClusterControllerStart func(ctx context.Context) error LeaderElectedClusterControllerStart func(ctx context.Context) error diff --git a/pkg/server/secrets-encrypt.go b/pkg/server/secrets-encrypt.go index 610c73b1cd..0db520bef8 100644 --- a/pkg/server/secrets-encrypt.go +++ b/pkg/server/secrets-encrypt.go @@ -186,6 +186,10 @@ func encryptionConfigHandler(ctx context.Context, server *config.Control) http.H genErrorMessage(resp, http.StatusBadRequest, err) return } + // If a user kills the k3s server immediately after this call, we run into issues where the files + // have not yet been written. This sleep ensures that things have time to sync to disk before + // the request completes. + time.Sleep(1 * time.Second) resp.WriteHeader(http.StatusOK) }) } diff --git a/pkg/server/server.go b/pkg/server/server.go index ffdcb13a15..97f639ba20 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -64,6 +64,8 @@ func StartServer(ctx context.Context, config *Config, cfg *cmds.Server) error { wg.Add(len(config.StartupHooks)) config.ControlConfig.Runtime.Handler = router(ctx, config, cfg) + config.ControlConfig.Runtime.StartupHooksWg = wg + shArgs := cmds.StartupHookArgs{ APIServerReady: config.ControlConfig.Runtime.APIServerReady, KubeConfigAdmin: config.ControlConfig.Runtime.KubeConfigAdmin, @@ -79,7 +81,7 @@ func StartServer(ctx context.Context, config *Config, cfg *cmds.Server) error { if config.ControlConfig.DisableAPIServer { go setETCDLabelsAndAnnotations(ctx, config) } else { - go startOnAPIServerReady(ctx, wg, config) + go startOnAPIServerReady(ctx, config) } if err := printTokens(&config.ControlConfig); err != nil { @@ -89,18 +91,18 @@ func StartServer(ctx context.Context, config *Config, cfg *cmds.Server) error { return writeKubeConfig(config.ControlConfig.Runtime.ServerCA, config) } -func startOnAPIServerReady(ctx context.Context, wg *sync.WaitGroup, config *Config) { +func startOnAPIServerReady(ctx context.Context, config *Config) { select { case <-ctx.Done(): return case <-config.ControlConfig.Runtime.APIServerReady: - if err := runControllers(ctx, wg, config); err != nil { + if err := runControllers(ctx, config); err != nil { logrus.Fatalf("failed to start controllers: %v", err) } } } -func runControllers(ctx context.Context, wg *sync.WaitGroup, config *Config) error { +func runControllers(ctx context.Context, config *Config) error { controlConfig := &config.ControlConfig sc, err := NewContext(ctx, controlConfig.Runtime.KubeConfigAdmin) @@ -108,7 +110,7 @@ func runControllers(ctx context.Context, wg *sync.WaitGroup, config *Config) err return errors.Wrap(err, "failed to create new server context") } - wg.Wait() + controlConfig.Runtime.StartupHooksWg.Wait() if err := stageFiles(ctx, sc, controlConfig); err != nil { return errors.Wrap(err, "failed to stage files") }