From 418c3fa858b69b12b9cefbcff0526f666a6236b9 Mon Sep 17 00:00:00 2001 From: Brad Davidson Date: Wed, 27 Apr 2022 13:44:15 -0700 Subject: [PATCH] Fix issue with datastore corruption on cluster-reset (#5515) * Bump etcd to v3.5.4-k3s1 * Fix issue with datastore corruption on cluster-reset * Disable unnecessary components during cluster reset Disable control-plane components and the tunnel setup during cluster-reset, even when not doing a restore. This reduces the amount of log clutter during cluster reset/restore, making any errors encountered more obvious. Signed-off-by: Brad Davidson --- go.mod | 23 +++++----- go.sum | 43 +++++++++---------- pkg/agent/run.go | 16 +++++-- pkg/cli/server/server.go | 30 ++++++++----- pkg/cluster/managed.go | 29 +++++++------ pkg/daemons/executor/executor.go | 31 ++++++------- pkg/etcd/etcd.go | 41 +++++++++++------- .../etcdrestore/etcd_restore_int_test.go | 2 +- 8 files changed, 122 insertions(+), 93 deletions(-) diff --git a/go.mod b/go.mod index 85ebb460fe..79275ecd1e 100644 --- a/go.mod +++ b/go.mod @@ -21,11 +21,14 @@ replace ( github.com/opencontainers/runtime-spec => github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 github.com/opencontainers/selinux => github.com/opencontainers/selinux v1.10.1 github.com/rancher/wrangler => github.com/rancher/wrangler v0.8.11-0.20220211163748-d5a8ee98be5f - go.etcd.io/etcd/api/v3 => github.com/k3s-io/etcd/api/v3 v3.5.3-k3s1 - go.etcd.io/etcd/client/pkg/v3 => github.com/k3s-io/etcd/client/pkg/v3 v3.5.3-k3s1 - go.etcd.io/etcd/client/v3 => github.com/k3s-io/etcd/client/v3 v3.5.3-k3s1 - go.etcd.io/etcd/etcdutl/v3 => github.com/k3s-io/etcd/etcdutl/v3 v3.5.3-k3s1 - go.etcd.io/etcd/server/v3 => github.com/k3s-io/etcd/server/v3 v3.5.3-k3s1 + go.etcd.io/etcd => github.com/k3s-io/etcd v3.4.18-k3s1+incompatible + go.etcd.io/etcd/api/v3 => github.com/k3s-io/etcd/api/v3 v3.5.4-k3s1 + go.etcd.io/etcd/client/pkg/v3 => github.com/k3s-io/etcd/client/pkg/v3 v3.5.4-k3s1 + 
go.etcd.io/etcd/client/v3 => github.com/k3s-io/etcd/client/v3 v3.5.4-k3s1 + go.etcd.io/etcd/etcdutl/v3 => github.com/k3s-io/etcd/etcdutl/v3 v3.5.4-k3s1 + go.etcd.io/etcd/pkg/v3 => github.com/k3s-io/etcd/pkg/v3 v3.5.4-k3s1 + go.etcd.io/etcd/raft/v3 => github.com/k3s-io/etcd/raft/v3 v3.5.4-k3s1 + go.etcd.io/etcd/server/v3 => github.com/k3s-io/etcd/server/v3 v3.5.4-k3s1 golang.org/x/crypto => golang.org/x/crypto v0.0.0-20210817164053-32db794688a5 golang.org/x/net => golang.org/x/net v0.0.0-20210825183410-e898025ed96a golang.org/x/sys => golang.org/x/sys v0.0.0-20220114195835-da31bd327af9 @@ -112,11 +115,11 @@ require ( github.com/tchap/go-patricia v2.3.0+incompatible // indirect github.com/urfave/cli v1.22.4 github.com/vishvananda/netlink v1.1.1-0.20210330154013-f5de75959ad5 - go.etcd.io/etcd/api/v3 v3.5.3 - go.etcd.io/etcd/client/pkg/v3 v3.5.3 - go.etcd.io/etcd/client/v3 v3.5.3 - go.etcd.io/etcd/etcdutl/v3 v3.5.3 - go.etcd.io/etcd/server/v3 v3.5.3 + go.etcd.io/etcd/api/v3 v3.5.4 + go.etcd.io/etcd/client/pkg/v3 v3.5.4 + go.etcd.io/etcd/client/v3 v3.5.4 + go.etcd.io/etcd/etcdutl/v3 v3.5.4 + go.etcd.io/etcd/server/v3 v3.5.4 go.uber.org/zap v1.19.0 golang.org/x/crypto v0.0.0-20220131195533-30dcbda58838 golang.org/x/net v0.0.0-20211216030914-fe4d6282115f diff --git a/go.sum b/go.sum index 8baf18bf7e..c2f7b64eb4 100644 --- a/go.sum +++ b/go.sum @@ -282,7 +282,6 @@ github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:ma github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/cpuguy83/go-md2man/v2 v2.0.1 h1:r/myEWzV9lfsM1tFLgDyu0atFtJ1fXn261LKYj/3DxU= github.com/cpuguy83/go-md2man/v2 v2.0.1/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= -github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7DoTY= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/creack/pty v1.1.11 h1:07n33Z8lZxZ2qwegKbObQohDhXDQxiMMz1NOUGYlesw= 
github.com/creack/pty v1.1.11/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= @@ -481,6 +480,7 @@ github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zV github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang-jwt/jwt v3.2.1+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I= github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/glog v1.0.0/go.mod h1:EWib/APOK0SL3dFbYqvxE3UYd8E6s1ouQ7iEp/0LWV4= @@ -582,7 +582,6 @@ github.com/gorilla/mux v1.7.3/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2z github.com/gorilla/mux v1.7.4/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI= github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= -github.com/gorilla/websocket v0.0.0-20170926233335-4201258b820c/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= github.com/gorilla/websocket v1.4.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= github.com/gorilla/websocket v1.4.2 h1:+/TMaTYc4QFitKJxsQ7Yye35DkWvkdLcvGKqM+x0Ufc= github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= @@ -714,16 +713,22 @@ github.com/k3s-io/containerd v1.5.11-k3s2 h1:9v+xY6pHzpMwECuQYd1P/Z6KiUsTvOj+adZ github.com/k3s-io/containerd v1.5.11-k3s2/go.mod h1:OiGmBp1wc/meLzuMvhogZdKaSWUQ0un+mwqkNxth/mY= github.com/k3s-io/cri-tools v1.22.0-k3s1 h1:a+7srHjpHVtCgYT4dtZQUTThLfo8xWR2SR53+JtKa0I= github.com/k3s-io/cri-tools v1.22.0-k3s1/go.mod h1:06cMrzcMIFKwDJziI2YyLdA0NVzN054GMRRWsL+NRPk= 
-github.com/k3s-io/etcd/api/v3 v3.5.3-k3s1 h1:XY2oUIIy2+DR7zXk/BVqQ4f2qFHbd2VTAGrnrT4IxhA= -github.com/k3s-io/etcd/api/v3 v3.5.3-k3s1/go.mod h1:5GB2vv4A4AOn3yk7MftYGHkUfGtDHnEraIjym4dYz5A= -github.com/k3s-io/etcd/client/pkg/v3 v3.5.3-k3s1 h1:MrI6IyT4Q8tkoY2kO3z2dpjWABNVtw1tdsXj3ozdMOs= -github.com/k3s-io/etcd/client/pkg/v3 v3.5.3-k3s1/go.mod h1:IJHfcCEKxYu1Os13ZdwCwIUTUVGYTSAM3YSwc9/Ac1g= -github.com/k3s-io/etcd/client/v3 v3.5.3-k3s1 h1:jufl5K+1x0cwtb8gRziPbP+EHpy+KCKc0E4ojWcjSQY= -github.com/k3s-io/etcd/client/v3 v3.5.3-k3s1/go.mod h1:S9LzGLV7Kh1Rg85nMVMjloLdUSMu+wvZZXPcUXDQ2Ds= -github.com/k3s-io/etcd/etcdutl/v3 v3.5.3-k3s1 h1:uPCEhACmVoFJnI9Izb2Y+UKRS5JM/F/90XTbTjsUsFQ= -github.com/k3s-io/etcd/etcdutl/v3 v3.5.3-k3s1/go.mod h1:F9bW3+f+cKZIcjWHkdfJ3lnCajF4FnPh7DjcgiBRb7g= -github.com/k3s-io/etcd/server/v3 v3.5.3-k3s1 h1:MVTrb5cp75kSMA9K240VMa5I+GKuYYP9xN/Nj+US27w= -github.com/k3s-io/etcd/server/v3 v3.5.3-k3s1/go.mod h1:xwZlQLuAWsWw5rpb/Gwzi3nFie9STKcrKQbM6evLi5g= +github.com/k3s-io/etcd v3.4.18-k3s1+incompatible h1:tUpMsW3V/iddqXsO6lQWJ0Vql4gKu+ILiP4BhKEu5ls= +github.com/k3s-io/etcd v3.4.18-k3s1+incompatible/go.mod h1:t1cqOhpjW3SEYhH7Wzlg51xzyIM2c5HMB9kvPO5k4gY= +github.com/k3s-io/etcd/api/v3 v3.5.4-k3s1 h1:Ac7cbUC5A0E4mlvbNne82N1x5ROSwXIrzXhBGR0cg94= +github.com/k3s-io/etcd/api/v3 v3.5.4-k3s1/go.mod h1:5GB2vv4A4AOn3yk7MftYGHkUfGtDHnEraIjym4dYz5A= +github.com/k3s-io/etcd/client/pkg/v3 v3.5.4-k3s1 h1:CEtCycRg3iAjMJOOn+IKrzoiiyb/FQscNsCh6Fr7FwI= +github.com/k3s-io/etcd/client/pkg/v3 v3.5.4-k3s1/go.mod h1:IJHfcCEKxYu1Os13ZdwCwIUTUVGYTSAM3YSwc9/Ac1g= +github.com/k3s-io/etcd/client/v3 v3.5.4-k3s1 h1:I2ODKgHYCI3i/8kuWpLASKfDMi7WEWNZghKMe4miCM8= +github.com/k3s-io/etcd/client/v3 v3.5.4-k3s1/go.mod h1:ZaRkVgBZC+L+dLCjTcF1hRXpgZXQPOvnA/Ak/gq3kiY= +github.com/k3s-io/etcd/etcdutl/v3 v3.5.4-k3s1 h1:qmdKtJ31UX00XRnFL6/FeprfslgRvs1tRggin8HvFXQ= +github.com/k3s-io/etcd/etcdutl/v3 v3.5.4-k3s1/go.mod h1:eK9eZfI/BxDQCztpuaJ1E/ufYpMw2Y16dPX1azGWrBU= +github.com/k3s-io/etcd/pkg/v3 
v3.5.4-k3s1 h1:GhGXwBpxDo0dMymOaFepu96SwfmRqxGwVzgcPPCNFxo= +github.com/k3s-io/etcd/pkg/v3 v3.5.4-k3s1/go.mod h1:OI+TtO+Aa3nhQSppMbwE4ld3uF1/fqqwbpfndbbrEe0= +github.com/k3s-io/etcd/raft/v3 v3.5.4-k3s1 h1:wr4FPk1k51wyVmo5WFdU7PppvxgWkhTpVUBklabSHHw= +github.com/k3s-io/etcd/raft/v3 v3.5.4-k3s1/go.mod h1:SCuunjYvZFC0fBX0vxMSPjuZmpcSk+XaAcMrD6Do03w= +github.com/k3s-io/etcd/server/v3 v3.5.4-k3s1 h1:swbvfSDpl7QsYO6Vh+EBgxZCMyG4N1tUgzLPrIjTvVg= +github.com/k3s-io/etcd/server/v3 v3.5.4-k3s1/go.mod h1:S5/YTU15KxymM5l3T6b09sNOHPXqGYIZStpuuGbb65c= github.com/k3s-io/helm-controller v0.12.1 h1:cZgXAreTvz+Aq3DzxL6RB6P1lEAlfDXxOKtwOzrvo+Y= github.com/k3s-io/helm-controller v0.12.1/go.mod h1:yBS3F5emwVjyzUUi3VWAuj9+Ogoq84Mf7CBXbAnKI1U= github.com/k3s-io/kine v0.8.1 h1:cuxZmENBUL5lvJORWGBjn87kKtIo8GK7o8H1hu+vd98= @@ -1196,8 +1201,8 @@ github.com/tchap/go-patricia v2.3.0+incompatible h1:GkY4dP3cEfEASBPPkWd+AmjYxhmD github.com/tchap/go-patricia v2.3.0+incompatible/go.mod h1:bmLyhP68RS6kStMGxByiQ23RP/odRBOTVjwp2cDyi6I= github.com/tencentcloud/tencentcloud-sdk-go v1.0.67/go.mod h1:asUz5BPXxgoPGaRgZaVm1iGcUAuHyYUo1nXqKa83cvI= github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk= -github.com/tmc/grpc-websocket-proxy v0.0.0-20170815181823-89b8d40f7ca8/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= +github.com/tmc/grpc-websocket-proxy v0.0.0-20200427203606-3cfed13b9966/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/tmc/grpc-websocket-proxy v0.0.0-20201229170055-e5319fda7802 h1:uruHq4dN7GR16kFc5fp3d1RIYzJW5onx8Ybykw2YQFA= github.com/tmc/grpc-websocket-proxy v0.0.0-20201229170055-e5319fda7802/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/tmccombs/hcl2json v0.3.3 h1:+DLNYqpWE0CsOQiEZu+OZm5ZBImake3wtITYxQ8uLFQ= @@ -1256,16 +1261,9 @@ go.etcd.io/bbolt v1.3.2/go.mod 
h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= go.etcd.io/bbolt v1.3.3/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= go.etcd.io/bbolt v1.3.6 h1:/ecaJf0sk1l4l6V4awd65v2C3ILy7MSj+s/x1ADCIMU= go.etcd.io/bbolt v1.3.6/go.mod h1:qXsaaIqmgQH0T+OPdb99Bf+PKfBBQVAdyD6TY9G8XM4= -go.etcd.io/etcd v0.0.0-20191023171146-3cf2f69b5738/go.mod h1:dnLIgRNXwCJa5e+c6mIZCrds/GIG4ncV9HhK5PX7jPg= -go.etcd.io/etcd v0.5.0-alpha.5.0.20200910180754-dd1b699fc489 h1:1JFLBqwIgdyHN1ZtgjTBwO+blA6gVOmZurpiMEsETKo= -go.etcd.io/etcd v0.5.0-alpha.5.0.20200910180754-dd1b699fc489/go.mod h1:yVHk9ub3CSBatqGNg7GRmsnfLWtoW60w4eDYfh7vHDg= go.etcd.io/etcd/client/v2 v2.305.0/go.mod h1:h9puh54ZTgAKtEbut2oe9P4L/oqKCVB6xsXlzd7alYQ= -go.etcd.io/etcd/client/v2 v2.305.3 h1:34Yifjgv/6nCPYesKIyh7Tuz9jqFQ1MUut4PV13NmPM= -go.etcd.io/etcd/client/v2 v2.305.3/go.mod h1:RMr4QdniyI8b7LX2IrLNPl9r8tsLUYBrwyxrfNbB6AU= -go.etcd.io/etcd/pkg/v3 v3.5.3 h1:zihRY1c2Q6nQLQSzXDa0hppOphyYyv4ssbO0xEXVJLs= -go.etcd.io/etcd/pkg/v3 v3.5.3/go.mod h1:a1Z9AfwKuEQMXgLfISmIg+50szwz7gmioUZj669wf60= -go.etcd.io/etcd/raft/v3 v3.5.3 h1:QblEh5qpiVJ17jOffHmPlviHzAvOLcgIodRw0ZyAE8s= -go.etcd.io/etcd/raft/v3 v3.5.3/go.mod h1:kCV6hIjK2Oe4UBxDM5dWYs5wZGsiSYH7JvGaEXDlpD4= +go.etcd.io/etcd/client/v2 v2.305.4 h1:Dcx3/MYyfKcPNLpR4VVQUP5KgYrBeJtktBwEKkw08Ao= +go.etcd.io/etcd/client/v2 v2.305.4/go.mod h1:Ud+VUwIi9/uQHOMA+4ekToJ12lTxlv0zB/+DHwTGEbU= go.mongodb.org/mongo-driver v1.0.3/go.mod h1:u7ryQJ+DOzQmeO7zB6MHyr8jkEQvC8vH7qLUO4lqsUM= go.mongodb.org/mongo-driver v1.1.1/go.mod h1:u7ryQJ+DOzQmeO7zB6MHyr8jkEQvC8vH7qLUO4lqsUM= go.mongodb.org/mongo-driver v1.1.2/go.mod h1:u7ryQJ+DOzQmeO7zB6MHyr8jkEQvC8vH7qLUO4lqsUM= @@ -1343,7 +1341,6 @@ golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86h golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/lint 
v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= -golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= diff --git a/pkg/agent/run.go b/pkg/agent/run.go index debe812120..92e0950252 100644 --- a/pkg/agent/run.go +++ b/pkg/agent/run.go @@ -208,7 +208,7 @@ func RunStandalone(ctx context.Context, cfg cmds.Agent) error { close(cfg.AgentReady) } - if err := tunnel.Setup(ctx, nodeConfig, proxy); err != nil { + if err := tunnelSetup(ctx, nodeConfig, cfg, proxy); err != nil { return err } @@ -380,7 +380,7 @@ func setupTunnelAndRunAgent(ctx context.Context, nodeConfig *daemonconfig.Node, // IsAPIServerLBEnabled is used as a shortcut for detecting RKE2, where the kubelet needs to // be run earlier in order to manage static pods. This should probably instead query a // flag on the executor or something. - if cfg.ETCDAgent { + if !cfg.ClusterReset && cfg.ETCDAgent { // ETCDAgent is only set to true on servers that are started with --disable-apiserver. 
// In this case, we may be running without an apiserver available in the cluster, and need // to wait for one to register and post it's address into APIAddressCh so that we can update @@ -404,7 +404,7 @@ func setupTunnelAndRunAgent(ctx context.Context, nodeConfig *daemonconfig.Node, agentRan = true } - if err := tunnel.Setup(ctx, nodeConfig, proxy); err != nil { + if err := tunnelSetup(ctx, nodeConfig, cfg, proxy); err != nil { return err } if !agentRan { @@ -435,3 +435,13 @@ func waitForAPIServerAddresses(ctx context.Context, nodeConfig *daemonconfig.Nod } } } + +// tunnelSetup calls tunnel setup, unless the embedded etcd cluster is being reset/restored, in which case +// this is unnecessary as the kubelet is only needed to manage static pods and does not need to establish +// tunneled connections to other cluster members. +func tunnelSetup(ctx context.Context, nodeConfig *daemonconfig.Node, cfg cmds.Agent, proxy proxy.Proxy) error { + if cfg.ClusterReset { + return nil + } + return tunnel.Setup(ctx, nodeConfig, proxy) +} diff --git a/pkg/cli/server/server.go b/pkg/cli/server/server.go index 63c092e846..65faaee472 100644 --- a/pkg/cli/server/server.go +++ b/pkg/cli/server/server.go @@ -400,15 +400,21 @@ func run(app *cli.Context, cfg *cmds.Server, leaderControllers server.CustomCont return errors.Wrap(err, "invalid tls-cipher-suites") } - // make sure components are disabled so we only perform a restore - // and bail out - if cfg.ClusterResetRestorePath != "" && cfg.ClusterReset { + // If performing a cluster reset, make sure control-plane components are + // disabled so we only perform a reset or restore and bail out. 
+ if cfg.ClusterReset { serverConfig.ControlConfig.ClusterInit = true serverConfig.ControlConfig.DisableAPIServer = true serverConfig.ControlConfig.DisableControllerManager = true serverConfig.ControlConfig.DisableScheduler = true serverConfig.ControlConfig.DisableCCM = true + // If the supervisor and apiserver are on the same port, everything is running embedded + // and we don't need the kubelet or containerd up to perform a cluster reset. + if serverConfig.ControlConfig.SupervisorPort == serverConfig.ControlConfig.HTTPSPort { + cfg.DisableAgent = true + } + dataDir, err := datadir.LocalHome(cfg.DataDir, false) if err != nil { return err @@ -417,14 +423,16 @@ func run(app *cli.Context, cfg *cmds.Server, leaderControllers server.CustomCont loadbalancer.ResetLoadBalancer(filepath.Join(dataDir, "agent"), loadbalancer.SupervisorServiceName) loadbalancer.ResetLoadBalancer(filepath.Join(dataDir, "agent"), loadbalancer.APIServerServiceName) - // at this point we're doing a restore. Check to see if we've - // passed in a token and if not, check if the token file exists. - // If it doesn't, return an error indicating the token is necessary. - if cfg.Token == "" { - tokenFile := filepath.Join(dataDir, "server", "token") - if _, err := os.Stat(tokenFile); err != nil { - if os.IsNotExist(err) { - return errors.New(tokenFile + " does not exist, please pass --token to complete the restoration") + if cfg.ClusterResetRestorePath != "" { + // at this point we're doing a restore. Check to see if we've + // passed in a token and if not, check if the token file exists. + // If it doesn't, return an error indicating the token is necessary. 
+ if cfg.Token == "" { + tokenFile := filepath.Join(dataDir, "server", "token") + if _, err := os.Stat(tokenFile); err != nil { + if os.IsNotExist(err) { + return errors.New(tokenFile + " does not exist, please pass --token to complete the restoration") + } } } } diff --git a/pkg/cluster/managed.go b/pkg/cluster/managed.go index c07a946d4b..c47923dafa 100644 --- a/pkg/cluster/managed.go +++ b/pkg/cluster/managed.go @@ -54,36 +54,37 @@ func (c *Cluster) testClusterDB(ctx context.Context) (<-chan struct{}, error) { // start starts the database, unless a cluster reset has been requested, in which case // it does that instead. func (c *Cluster) start(ctx context.Context) error { - resetFile := etcd.ResetFile(c.config) if c.managedDB == nil { return nil } + resetFile := etcd.ResetFile(c.config) + rebootstrap := func() error { + return c.storageBootstrap(ctx) + } - switch { - case c.config.ClusterReset && c.config.ClusterResetRestorePath != "": - rebootstrap := func() error { - return c.storageBootstrap(ctx) + if c.config.ClusterReset { + // If we're restoring from a snapshot, don't check the reset-flag - just reset and restore. + if c.config.ClusterResetRestorePath != "" { + return c.managedDB.Reset(ctx, rebootstrap) } - return c.managedDB.Reset(ctx, rebootstrap) - case c.config.ClusterReset: + // If the reset-flag doesn't exist, reset. This will create the reset-flag if it succeeds. if _, err := os.Stat(resetFile); err != nil { if !os.IsNotExist(err) { return err } - rebootstrap := func() error { - return c.storageBootstrap(ctx) - } return c.managedDB.Reset(ctx, rebootstrap) } - return fmt.Errorf("cluster-reset was successfully performed, please remove the cluster-reset flag and start %s normally, if you need to perform another cluster reset, you must first manually delete the %s file", version.Program, resetFile) + // The reset-flag exists, ask the user to remove it if they want to reset again. 
+ return fmt.Errorf("Managed etcd cluster membership was previously reset, please remove the cluster-reset flag and start %s normally. If you need to perform another cluster reset, you must first manually delete the %s file", version.Program, resetFile) } + // The reset-flag exists but we're not resetting; remove it if _, err := os.Stat(resetFile); err == nil { - // before removing reset file we need to delete the node passwd secret + // Before removing reset file we need to delete the node passwd secret in case the node + // password from the previously restored snapshot differs from the current password on disk. go c.deleteNodePasswdSecret(ctx) + os.Remove(resetFile) } - // removing the reset file and ignore error if the file doesn't exist - os.Remove(resetFile) return c.managedDB.Start(ctx, c.clientAccessInfo) } diff --git a/pkg/daemons/executor/executor.go b/pkg/daemons/executor/executor.go index a84fc25999..1362d85bdb 100644 --- a/pkg/daemons/executor/executor.go +++ b/pkg/daemons/executor/executor.go @@ -35,21 +35,22 @@ type Executor interface { } type ETCDConfig struct { - InitialOptions `json:",inline"` - Name string `json:"name,omitempty"` - ListenClientURLs string `json:"listen-client-urls,omitempty"` - ListenMetricsURLs string `json:"listen-metrics-urls,omitempty"` - ListenPeerURLs string `json:"listen-peer-urls,omitempty"` - AdvertiseClientURLs string `json:"advertise-client-urls,omitempty"` - DataDir string `json:"data-dir,omitempty"` - SnapshotCount int `json:"snapshot-count,omitempty"` - ServerTrust ServerTrust `json:"client-transport-security"` - PeerTrust PeerTrust `json:"peer-transport-security"` - ForceNewCluster bool `json:"force-new-cluster,omitempty"` - HeartbeatInterval int `json:"heartbeat-interval"` - ElectionTimeout int `json:"election-timeout"` - Logger string `json:"logger"` - LogOutputs []string `json:"log-outputs"` + InitialOptions `json:",inline"` + Name string `json:"name,omitempty"` + ListenClientURLs string 
`json:"listen-client-urls,omitempty"` + ListenMetricsURLs string `json:"listen-metrics-urls,omitempty"` + ListenPeerURLs string `json:"listen-peer-urls,omitempty"` + AdvertiseClientURLs string `json:"advertise-client-urls,omitempty"` + DataDir string `json:"data-dir,omitempty"` + SnapshotCount int `json:"snapshot-count,omitempty"` + ServerTrust ServerTrust `json:"client-transport-security"` + PeerTrust PeerTrust `json:"peer-transport-security"` + ForceNewCluster bool `json:"force-new-cluster,omitempty"` + HeartbeatInterval int `json:"heartbeat-interval"` + ElectionTimeout int `json:"election-timeout"` + Logger string `json:"logger"` + LogOutputs []string `json:"log-outputs"` + ExperimentalInitialCorruptCheck bool `json:"experimental-initial-corrupt-check"` } type ServerTrust struct { diff --git a/pkg/etcd/etcd.go b/pkg/etcd/etcd.go index ea0057225c..32986336cf 100644 --- a/pkg/etcd/etcd.go +++ b/pkg/etcd/etcd.go @@ -93,6 +93,7 @@ type ETCD struct { address string cron *cron.Cron s3 *S3 + cancel context.CancelFunc } type learnerProgress struct { @@ -288,7 +289,11 @@ func (e *ETCD) Reset(ctx context.Context, rebootstrap func() error) error { } if len(members.Members) == 1 && members.Members[0].Name == e.name { - logrus.Infof("Etcd is running, restart without --cluster-reset flag now. Backup and delete ${datadir}/server/db on each peer etcd server and rejoin the nodes") + // Cancel the etcd server context and allow it time to shut down cleanly. + // Ideally we would use a waitgroup and properly sequence shutdown of the various components. + e.cancel() + time.Sleep(time.Second * 5) + logrus.Infof("Managed etcd cluster membership has been reset, restart without --cluster-reset flag now. 
Backup and delete ${datadir}/server/db on each peer etcd server and rejoin the nodes") os.Exit(0) } } else { @@ -769,6 +774,7 @@ func (e *ETCD) metricsURL(expose bool) string { // cluster returns ETCDConfig for a cluster func (e *ETCD) cluster(ctx context.Context, forceNew bool, options executor.InitialOptions) error { + ctx, e.cancel = context.WithCancel(ctx) return executor.ETCD(ctx, executor.ETCDConfig{ Name: e.name, InitialOptions: options, @@ -790,10 +796,11 @@ func (e *ETCD) cluster(ctx context.Context, forceNew bool, options executor.Init ClientCertAuth: true, TrustedCAFile: e.config.Runtime.ETCDPeerCA, }, - ElectionTimeout: 5000, - HeartbeatInterval: 500, - Logger: "zap", - LogOutputs: []string{"stderr"}, + ElectionTimeout: 5000, + HeartbeatInterval: 500, + Logger: "zap", + LogOutputs: []string{"stderr"}, + ExperimentalInitialCorruptCheck: true, }, e.config.ExtraEtcdArgs) } @@ -821,18 +828,20 @@ func (e *ETCD) StartEmbeddedTemporary(ctx context.Context) error { } embedded := executor.Embedded{} + ctx, e.cancel = context.WithCancel(ctx) return embedded.ETCD(ctx, executor.ETCDConfig{ - InitialOptions: executor.InitialOptions{AdvertisePeerURL: peerURL}, - DataDir: tmpDataDir, - ForceNewCluster: true, - AdvertiseClientURLs: clientURL, - ListenClientURLs: clientURL, - ListenPeerURLs: peerURL, - Logger: "zap", - HeartbeatInterval: 500, - ElectionTimeout: 5000, - Name: e.name, - LogOutputs: []string{"stderr"}, + InitialOptions: executor.InitialOptions{AdvertisePeerURL: peerURL}, + DataDir: tmpDataDir, + ForceNewCluster: true, + AdvertiseClientURLs: clientURL, + ListenClientURLs: clientURL, + ListenPeerURLs: peerURL, + Logger: "zap", + HeartbeatInterval: 500, + ElectionTimeout: 5000, + Name: e.name, + LogOutputs: []string{"stderr"}, + ExperimentalInitialCorruptCheck: true, }, append(e.config.ExtraAPIArgs, "--max-snapshots=0", "--max-wals=0")) } diff --git a/tests/integration/etcdrestore/etcd_restore_int_test.go 
b/tests/integration/etcdrestore/etcd_restore_int_test.go index ec6a8daa7e..c106096266 100644 --- a/tests/integration/etcdrestore/etcd_restore_int_test.go +++ b/tests/integration/etcdrestore/etcd_restore_int_test.go @@ -71,7 +71,7 @@ var _ = Describe("etcd snapshot restore", func() { filePath = strings.TrimSuffix(filePath, "\n") Eventually(func() (string, error) { return testutil.K3sCmd("server", "-d", tmpdDataDir, "--cluster-reset", "--token", "test", "--cluster-reset-restore-path", filePath) - }, "360s", "5s").Should(ContainSubstring(`Etcd is running, restart without --cluster-reset flag now`)) + }, "360s", "5s").Should(ContainSubstring(`restart without --cluster-reset flag now`)) }) It("start k3s server", func() { var err error