2020-05-05 22:02:16 +00:00
package etcd
import (
2020-10-27 18:06:26 +00:00
"bytes"
2020-05-05 22:02:16 +00:00
"context"
"crypto/tls"
"encoding/json"
"fmt"
2022-10-08 00:36:57 +00:00
"io/fs"
2022-04-15 00:31:49 +00:00
"net"
2020-05-05 22:02:16 +00:00
"net/http"
"net/url"
"os"
"path/filepath"
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
"sort"
"strconv"
2020-05-05 22:02:16 +00:00
"strings"
2024-06-18 23:30:28 +00:00
"sync"
2020-05-05 22:02:16 +00:00
"time"
"github.com/google/uuid"
"github.com/gorilla/mux"
2022-03-02 23:47:27 +00:00
"github.com/k3s-io/k3s/pkg/clientaccess"
2023-09-21 18:53:50 +00:00
"github.com/k3s-io/k3s/pkg/cluster/managed"
2022-03-02 23:47:27 +00:00
"github.com/k3s-io/k3s/pkg/daemons/config"
"github.com/k3s-io/k3s/pkg/daemons/control/deps"
"github.com/k3s-io/k3s/pkg/daemons/executor"
2024-06-11 00:29:17 +00:00
"github.com/k3s-io/k3s/pkg/etcd/s3"
"github.com/k3s-io/k3s/pkg/etcd/snapshot"
2024-03-28 00:48:13 +00:00
"github.com/k3s-io/k3s/pkg/server/auth"
2023-11-13 14:39:24 +00:00
"github.com/k3s-io/k3s/pkg/util"
2022-03-02 23:47:27 +00:00
"github.com/k3s-io/k3s/pkg/version"
2021-04-26 16:47:53 +00:00
"github.com/k3s-io/kine/pkg/client"
endpoint2 "github.com/k3s-io/kine/pkg/endpoint"
2022-02-24 22:35:08 +00:00
cp "github.com/otiai10/copy"
2020-05-05 22:02:16 +00:00
"github.com/pkg/errors"
certutil "github.com/rancher/dynamiclistener/cert"
2024-05-06 16:42:27 +00:00
controllerv1 "github.com/rancher/wrangler/v3/pkg/generated/controllers/core/v1"
2024-04-29 23:29:49 +00:00
"github.com/rancher/wrangler/v3/pkg/start"
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
"github.com/robfig/cron/v3"
2020-05-05 22:02:16 +00:00
"github.com/sirupsen/logrus"
2021-07-02 19:55:47 +00:00
"go.etcd.io/etcd/api/v3/etcdserverpb"
"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
2024-10-24 00:16:34 +00:00
"go.etcd.io/etcd/client/pkg/v3/logutil"
2021-07-02 19:55:47 +00:00
clientv3 "go.etcd.io/etcd/client/v3"
2024-10-24 00:16:34 +00:00
"go.etcd.io/etcd/client/v3/credentials"
2024-06-11 00:29:17 +00:00
snapshotv3 "go.etcd.io/etcd/etcdutl/v3/snapshot"
2024-10-24 00:16:34 +00:00
"go.etcd.io/etcd/server/v3/etcdserver"
"go.uber.org/zap/zapcore"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
"google.golang.org/grpc/keepalive"
2023-11-13 14:39:24 +00:00
v1 "k8s.io/api/core/v1"
2020-10-27 18:06:26 +00:00
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2023-11-13 14:39:24 +00:00
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
2020-05-05 22:02:16 +00:00
utilnet "k8s.io/apimachinery/pkg/util/net"
2023-01-10 18:51:39 +00:00
"k8s.io/apimachinery/pkg/util/wait"
2023-12-21 00:23:27 +00:00
"k8s.io/client-go/kubernetes"
2023-11-13 14:39:24 +00:00
nodeHelper "k8s.io/component-helpers/node/util"
nodeUtil "k8s.io/kubernetes/pkg/controller/util/node"
2021-02-12 15:35:57 +00:00
)
2020-10-27 18:06:26 +00:00
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
const (
2024-10-24 00:16:34 +00:00
statusTimeout = time . Second * 30
2021-08-05 20:32:01 +00:00
manageTickerTime = time . Second * 15
learnerMaxStallTime = time . Minute * 5
memberRemovalTimeout = time . Minute * 1
2020-10-27 18:06:26 +00:00
2023-01-10 18:51:39 +00:00
// snapshotJitterMax defines the maximum time skew on cron-triggered snapshots. The actual jitter
// will be a random Duration somewhere between 0 and snapshotJitterMax.
snapshotJitterMax = time . Second * 5
2020-10-27 18:06:26 +00:00
// defaultDialTimeout is intentionally short so that connections timeout within the testTimeout defined above
defaultDialTimeout = 2 * time . Second
// other defaults from k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go
defaultKeepAliveTime = 30 * time . Second
defaultKeepAliveTimeout = 10 * time . Second
2024-01-18 05:00:18 +00:00
heartbeatInterval = 5 * time . Minute
2020-12-07 20:30:44 +00:00
2023-09-28 00:28:03 +00:00
maxBackupRetention = 5
2023-12-21 00:23:27 +00:00
etcdStatusType = v1 . NodeConditionType ( "EtcdIsVoter" )
StatusUnjoined MemberStatus = "unjoined"
StatusUnhealthy MemberStatus = "unhealthy"
StatusLearner MemberStatus = "learner"
StatusVoter MemberStatus = "voter"
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
)
2021-05-01 01:26:39 +00:00
var (
learnerProgressKey = version . Program + "/etcd/learnerProgress"
// AddressKey will contain the value of api addresses list
AddressKey = version . Program + "/apiaddresses"
2021-09-14 15:20:38 +00:00
NodeNameAnnotation = "etcd." + version . Program + ".cattle.io/node-name"
NodeAddressAnnotation = "etcd." + version . Program + ".cattle.io/node-address"
2022-02-16 22:19:58 +00:00
2024-03-11 22:20:21 +00:00
ErrAddressNotSet = errors . New ( "apiserver addresses not yet set" )
ErrNotMember = errNotMember ( )
ErrMemberListFailed = errMemberListFailed ( )
2021-05-01 01:26:39 +00:00
)
2021-09-14 15:20:38 +00:00
// NodeControllerGetter is a function that takes no arguments and returns a
// controllerv1.NodeController.
type NodeControllerGetter func ( ) controllerv1 . NodeController
2023-09-21 18:53:50 +00:00
// Compile-time assertion that *ETCD satisfies the managed.Driver interface.
var _ managed.Driver = (*ETCD)(nil)
2023-12-21 00:23:27 +00:00
// MemberStatus is a string naming the state of an etcd cluster member. The
// values declared in this file are StatusUnjoined, StatusUnhealthy,
// StatusLearner, and StatusVoter.
type MemberStatus string
2021-05-01 01:26:39 +00:00
type ETCD struct {
2024-06-18 23:30:28 +00:00
client * clientv3 . Client
config * config . Control
name string
address string
cron * cron . Cron
cancel context . CancelFunc
2024-06-11 00:29:17 +00:00
s3 * s3 . Controller
2024-06-18 23:30:28 +00:00
snapshotMu * sync . Mutex
2021-05-01 01:26:39 +00:00
}
// learnerProgress is a JSON-serializable record of a member's ID, name, raft
// applied index, and the time of its last progress.
// NOTE(review): presumably persisted under learnerProgressKey to detect stalled learners — confirm.
type learnerProgress struct {
ID uint64 ` json:"id,omitempty" `
Name string ` json:"name,omitempty" `
RaftAppliedIndex uint64 ` json:"raftAppliedIndex,omitempty" `
LastProgress metav1 . Time ` json:"lastProgress,omitempty" `
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
// Members contains a slice that holds all
// members of the cluster.
2020-05-05 22:02:16 +00:00
type Members struct {
Members [ ] * etcdserverpb . Member ` json:"members" `
}
2024-06-11 00:29:17 +00:00
// membershipError reports that this server was not found among the members of
// the etcd cluster. It matches ErrNotMember via its Is method.
type membershipError struct {
	// self is the member this server expected to find.
	self string
	// members are the members that were actually found.
	members []string
}
2024-06-11 00:29:17 +00:00
func ( e * membershipError ) Error ( ) string {
return fmt . Sprintf ( "this server is a not a member of the etcd cluster. Found %v, expect: %s" , e . members , e . self )
2022-03-01 23:55:35 +00:00
}
2024-06-11 00:29:17 +00:00
func ( e * membershipError ) Is ( target error ) bool {
2022-03-01 23:55:35 +00:00
switch target {
case ErrNotMember :
return true
}
return false
}
2024-06-11 00:29:17 +00:00
// errNotMember returns a pointer to a zero-valued membershipError; it is used
// to initialize the ErrNotMember sentinel.
func errNotMember() error {
	return new(membershipError)
}
2022-03-01 23:55:35 +00:00
2024-06-11 00:29:17 +00:00
// memberListError reports a failure to get the MemberList from the server. It
// matches ErrMemberListFailed via its Is method.
type memberListError struct {
	// err is the underlying failure; it is nil on the sentinel value.
	err error
}
2024-06-11 00:29:17 +00:00
func ( e * memberListError ) Error ( ) string {
return fmt . Sprintf ( "failed to get MemberList from server: %v" , e . err )
2024-03-11 22:20:21 +00:00
}
2024-06-11 00:29:17 +00:00
func ( e * memberListError ) Is ( target error ) bool {
2024-03-11 22:20:21 +00:00
switch target {
case ErrMemberListFailed :
return true
}
return false
}
2024-06-11 00:29:17 +00:00
// errMemberListFailed returns a pointer to a zero-valued memberListError; it
// is used to initialize the ErrMemberListFailed sentinel.
func errMemberListFailed() error {
	return new(memberListError)
}
2024-03-11 22:20:21 +00:00
2021-05-01 01:26:39 +00:00
// NewETCD creates a new value of type
2024-06-18 23:30:28 +00:00
// ETCD with initialized cron and snapshot mutex values.
2021-05-01 01:26:39 +00:00
func NewETCD ( ) * ETCD {
return & ETCD {
2024-06-18 23:30:28 +00:00
cron : cron . New ( cron . WithLogger ( cronLogger ) ) ,
snapshotMu : & sync . Mutex { } ,
2021-05-01 01:26:39 +00:00
}
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
// EndpointName returns the name of the endpoint.
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) EndpointName ( ) string {
return "etcd"
}
2023-09-21 18:53:50 +00:00
// SetControlConfig passes the cluster config into the etcd datastore. This is necessary
// because the config may not yet be fully built at the time the Driver instance is registered.
func ( e * ETCD ) SetControlConfig ( config * config . Control ) error {
if e . config != nil {
return errors . New ( "control config already set" )
2022-02-24 22:35:08 +00:00
}
2023-09-21 18:53:50 +00:00
e . config = config
2022-03-10 22:03:02 +00:00
2023-09-21 18:53:50 +00:00
address , err := getAdvertiseAddress ( e . config . PrivateIP )
2022-02-24 22:35:08 +00:00
if err != nil {
return err
}
e . address = address
return e . setName ( false )
2021-05-07 23:10:04 +00:00
}
2022-03-25 18:52:40 +00:00
// Test ensures that the local node is a voting member of the target cluster,
// and that the datastore is defragmented and not in maintenance mode due to alarms.
2020-10-27 18:06:26 +00:00
// If it is still a learner or not a part of the cluster, an error is raised.
2022-03-25 18:52:40 +00:00
// If it cannot be defragmented or has any alarms that cannot be disarmed, an error is raised.
2020-10-27 18:06:26 +00:00
func ( e * ETCD ) Test ( ctx context . Context ) error {
2023-09-21 18:53:50 +00:00
if e . config == nil {
return errors . New ( "control config not set" )
}
if e . client == nil {
return errors . New ( "etcd datastore is not started" )
}
2024-10-24 00:16:34 +00:00
status , err := e . status ( ctx )
2020-07-29 20:52:49 +00:00
if err != nil {
2024-10-24 00:16:34 +00:00
return errors . Wrap ( err , "failed to get etcd status" )
} else if status . IsLearner {
return errors . New ( "this server has not yet been promoted from learner to voting member" )
} else if status . Leader == 0 {
return etcdserver . ErrNoLeader
2020-07-29 20:52:49 +00:00
}
2024-10-24 00:16:34 +00:00
logrus . Infof ( "Connected to etcd v%s - datastore using %d of %d bytes" , status . Version , status . DbSizeInUse , status . DbSize )
if len ( status . Errors ) > 0 {
logrus . Warnf ( "Errors present on etcd cluster: %s" , strings . Join ( status . Errors , "," ) )
2020-07-29 20:52:49 +00:00
}
2020-10-27 18:06:26 +00:00
2024-10-24 00:16:34 +00:00
// defrag this node to reclaim freed space from compacted revisions
2022-03-25 18:52:40 +00:00
if err := e . defragment ( ctx ) ; err != nil {
return errors . Wrap ( err , "failed to defragment etcd database" )
}
2024-10-24 00:16:34 +00:00
// clear alarms on this node
if err := e . clearAlarms ( ctx , status . Header . MemberId ) ; err != nil {
return errors . Wrap ( err , "failed to disarm etcd alarms" )
2022-02-23 21:52:46 +00:00
}
2024-10-24 00:16:34 +00:00
// refresh status - note that errors may remain on other nodes, but this
// should not prevent us from continuing with startup.
status , err = e . status ( ctx )
2022-03-25 18:52:40 +00:00
if err != nil {
2024-10-24 00:16:34 +00:00
return errors . Wrap ( err , "failed to get etcd status" )
2022-03-25 18:52:40 +00:00
}
2024-10-24 00:16:34 +00:00
logrus . Infof ( "Datastore using %d of %d bytes after defragment" , status . DbSizeInUse , status . DbSize )
2022-03-25 18:52:40 +00:00
if len ( status . Errors ) > 0 {
2024-10-24 00:16:34 +00:00
logrus . Warnf ( "Errors present on etcd cluster after defragment: %s" , strings . Join ( status . Errors , "," ) )
2022-03-25 18:52:40 +00:00
}
2020-05-05 22:02:16 +00:00
members , err := e . client . MemberList ( ctx )
if err != nil {
return err
}
2024-10-24 00:16:34 +00:00
// Ensure that there is a cluster member with our peerURL and name
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
var memberNameUrls [ ] string
2020-05-05 22:02:16 +00:00
for _ , member := range members . Members {
for _ , peerURL := range member . PeerURLs {
if peerURL == e . peerURL ( ) && e . name == member . Name {
return nil
}
}
if len ( member . PeerURLs ) > 0 {
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
memberNameUrls = append ( memberNameUrls , member . Name + "=" + member . PeerURLs [ 0 ] )
2020-05-05 22:02:16 +00:00
}
}
2024-10-24 00:16:34 +00:00
// no matching PeerURL on any Member, return an error that indicates what was expected vs what we found.
2024-06-11 00:29:17 +00:00
return & membershipError { members : memberNameUrls , self : e . name + "=" + e . peerURL ( ) }
2020-05-05 22:02:16 +00:00
}
2023-09-21 18:48:21 +00:00
// dbDir returns the path to dataDir/db/etcd
func dbDir ( config * config . Control ) string {
2020-09-24 05:59:58 +00:00
return filepath . Join ( config . DataDir , "db" , "etcd" )
2020-05-05 22:02:16 +00:00
}
2023-09-21 18:53:50 +00:00
// walDir returns the path to etcdDBDir/member/wal
2020-09-24 05:59:58 +00:00
func walDir ( config * config . Control ) string {
2023-09-21 18:48:21 +00:00
return filepath . Join ( dbDir ( config ) , "member" , "wal" )
2020-05-05 22:02:16 +00:00
}
2021-04-26 16:47:53 +00:00
// sqliteFile returns the path dataDir/db/state.db, where dataDir is taken
// from the given control config.
func sqliteFile(config *config.Control) string {
	file := filepath.Join(config.DataDir, "db", "state.db")
	return file
}
2023-09-21 18:53:50 +00:00
// nameFile returns the path to etcdDBDir/name.
2020-05-05 22:02:16 +00:00
func nameFile ( config * config . Control ) string {
2023-09-21 18:48:21 +00:00
return filepath . Join ( dbDir ( config ) , "name" )
2020-05-05 22:02:16 +00:00
}
2023-09-21 18:53:50 +00:00
// clearReset removes the reset file. A reset file that does not exist is not
// treated as an error; any other removal failure is returned to the caller.
func (e *ETCD) clearReset() error {
	err := os.Remove(e.ResetFile())
	if err == nil || os.IsNotExist(err) {
		return nil
	}
	return err
}
// IsReset checks to see if the reset file exists, indicating that a
// cluster-reset has been completed successfully.
//
// It returns an error if the control config has not been set, or if the file
// could not be inspected for any reason other than it not existing. A missing
// file yields (false, nil).
func (e *ETCD) IsReset() (bool, error) {
	if e.config == nil {
		return false, errors.New("control config not set")
	}

	_, err := os.Stat(e.ResetFile())
	switch {
	case err == nil:
		return true, nil
	case os.IsNotExist(err):
		return false, nil
	default:
		return false, err
	}
}
// ResetFile returns the path to etcdDBDir/reset-flag.
func ( e * ETCD ) ResetFile ( ) string {
if e . config == nil {
panic ( "control config not set" )
}
return filepath . Join ( e . config . DataDir , "db" , "reset-flag" )
2020-09-30 00:53:31 +00:00
}
2020-09-22 03:23:18 +00:00
// IsInitialized checks to see if a WAL directory exists. If so, we assume that etcd
// has already been brought up at least once.
2023-09-21 18:53:50 +00:00
func ( e * ETCD ) IsInitialized ( ) ( bool , error ) {
if e . config == nil {
return false , errors . New ( "control config not set" )
}
dir := walDir ( e . config )
2020-09-22 03:23:18 +00:00
if s , err := os . Stat ( dir ) ; err == nil && s . IsDir ( ) {
2020-05-05 22:02:16 +00:00
return true , nil
} else if os . IsNotExist ( err ) {
return false , nil
} else {
2021-10-07 19:47:00 +00:00
return false , errors . Wrap ( err , "invalid state for wal directory " + dir )
2020-05-05 22:02:16 +00:00
}
}
2021-11-10 12:33:42 +00:00
// Reset resets an etcd node to a single node cluster.
2021-03-11 20:07:40 +00:00
func ( e * ETCD ) Reset ( ctx context . Context , rebootstrap func ( ) error ) error {
2020-09-22 03:23:18 +00:00
// Wait for etcd to come up as a new single-node cluster, then exit
2020-05-05 22:02:16 +00:00
go func ( ) {
2024-02-21 18:26:13 +00:00
<- e . config . Runtime . ContainerRuntimeReady
2020-07-29 20:52:49 +00:00
t := time . NewTicker ( 5 * time . Second )
defer t . Stop ( )
for range t . C {
2020-10-27 18:06:26 +00:00
if err := e . Test ( ctx ) ; err == nil {
2023-09-21 18:53:50 +00:00
// reset the apiaddresses to nil since we are doing a restoration
if _ , err := e . client . Put ( ctx , AddressKey , "" ) ; err != nil {
logrus . Warnf ( "failed to reset api addresses key in etcd: %v" , err )
continue
}
2020-05-05 22:02:16 +00:00
members , err := e . client . MemberList ( ctx )
if err != nil {
continue
}
2021-12-09 21:54:27 +00:00
if rebootstrap != nil {
// storageBootstrap() - runtime structure has been written with correct certificate data
if err := rebootstrap ( ) ; err != nil {
logrus . Fatal ( err )
}
2021-03-03 18:14:12 +00:00
}
// call functions to rewrite them from daemons/control/server.go (prepare())
2022-02-24 19:01:14 +00:00
if err := deps . GenServerDeps ( e . config ) ; err != nil {
2021-03-03 18:14:12 +00:00
logrus . Fatal ( err )
}
2020-05-05 22:02:16 +00:00
if len ( members . Members ) == 1 && members . Members [ 0 ] . Name == e . name {
2022-04-27 20:44:15 +00:00
// Cancel the etcd server context and allow it time to shutdown cleanly.
// Ideally we would use a waitgroup and properly sequence shutdown of the various components.
e . cancel ( )
time . Sleep ( time . Second * 5 )
logrus . Infof ( "Managed etcd cluster membership has been reset, restart without --cluster-reset flag now. Backup and delete ${datadir}/server/db on each peer etcd server and rejoin the nodes" )
2020-05-05 22:02:16 +00:00
os . Exit ( 0 )
}
2021-10-22 22:25:29 +00:00
} else {
// make sure that peer ips are updated to the node ip in case the test fails
members , err := e . client . MemberList ( ctx )
if err != nil {
logrus . Warnf ( "failed to list etcd members: %v" , err )
continue
}
if len ( members . Members ) > 1 {
logrus . Warnf ( "failed to update peer url: etcd still has more than one member" )
continue
}
if _ , err := e . client . MemberUpdate ( ctx , members . Members [ 0 ] . ID , [ ] string { e . peerURL ( ) } ) ; err != nil {
logrus . Warnf ( "failed to update peer url: %v" , err )
continue
}
2020-05-05 22:02:16 +00:00
}
}
} ( )
2020-09-22 03:23:18 +00:00
2023-09-21 18:53:50 +00:00
if err := e . startClient ( ctx ) ; err != nil {
return err
}
2020-09-22 03:23:18 +00:00
// If asked to restore from a snapshot, do so
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
if e . config . ClusterResetRestorePath != "" {
2024-06-11 00:29:17 +00:00
if e . config . EtcdS3 != nil {
2023-10-27 18:38:00 +00:00
logrus . Infof ( "Retrieving etcd snapshot %s from S3" , e . config . ClusterResetRestorePath )
2024-06-11 00:29:17 +00:00
s3client , err := e . getS3Client ( ctx )
if err != nil {
if errors . Is ( err , s3 . ErrNoConfigSecret ) {
return errors . New ( "cannot use S3 config secret when restoring snapshot; configuration must be set in CLI or config file" )
} else {
return errors . Wrap ( err , "failed to initialize S3 client" )
}
2021-03-03 18:14:12 +00:00
}
2024-06-11 00:29:17 +00:00
dir , err := snapshotDir ( e . config , true )
if err != nil {
return errors . Wrap ( err , "failed to get the snapshot dir" )
2021-03-03 18:14:12 +00:00
}
2024-06-11 00:29:17 +00:00
path , err := s3client . Download ( ctx , e . config . ClusterResetRestorePath , dir )
if err != nil {
return errors . Wrap ( err , "failed to download snapshot from S3" )
}
e . config . ClusterResetRestorePath = path
2021-03-03 18:14:12 +00:00
logrus . Infof ( "S3 download complete for %s" , e . config . ClusterResetRestorePath )
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
info , err := os . Stat ( e . config . ClusterResetRestorePath )
if os . IsNotExist ( err ) {
return fmt . Errorf ( "etcd: snapshot path does not exist: %s" , e . config . ClusterResetRestorePath )
}
if info . IsDir ( ) {
2020-09-22 03:23:18 +00:00
return fmt . Errorf ( "etcd: snapshot path must be a file, not a directory: %s" , e . config . ClusterResetRestorePath )
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
}
2020-09-19 01:09:36 +00:00
if err := e . Restore ( ctx ) ; err != nil {
return err
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
}
2020-09-22 03:23:18 +00:00
2020-09-19 01:09:36 +00:00
if err := e . setName ( true ) ; err != nil {
return err
}
2020-09-30 00:53:31 +00:00
// touch a file to avoid multiple resets
2023-09-21 18:53:50 +00:00
if err := os . WriteFile ( e . ResetFile ( ) , [ ] byte { } , 0600 ) ; err != nil {
2020-09-30 00:53:31 +00:00
return err
}
2024-01-16 22:43:08 +00:00
2020-05-05 22:02:16 +00:00
return e . newCluster ( ctx , true )
}
2020-09-22 03:23:18 +00:00
// Start starts the datastore
2020-07-29 20:52:49 +00:00
func ( e * ETCD ) Start ( ctx context . Context , clientAccessInfo * clientaccess . Info ) error {
2023-09-21 18:53:50 +00:00
isInitialized , err := e . IsInitialized ( )
2020-05-05 22:02:16 +00:00
if err != nil {
2023-09-21 18:53:50 +00:00
return errors . Wrapf ( err , "failed to check for initialized etcd datastore" )
}
if err := e . startClient ( ctx ) ; err != nil {
return err
2020-05-05 22:02:16 +00:00
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
if ! e . config . EtcdDisableSnapshots {
e . setSnapshotFunction ( ctx )
e . cron . Start ( )
}
2020-10-27 18:06:26 +00:00
go e . manageLearners ( ctx )
2024-06-11 00:29:17 +00:00
go e . getS3Client ( ctx )
2020-10-27 18:06:26 +00:00
2022-02-16 22:19:58 +00:00
if isInitialized {
2024-02-22 22:37:32 +00:00
// check etcd dir permission
2023-09-21 18:48:21 +00:00
etcdDir := dbDir ( e . config )
2021-01-06 17:05:49 +00:00
info , err := os . Stat ( etcdDir )
if err != nil {
return err
}
if info . Mode ( ) != 0700 {
if err := os . Chmod ( etcdDir , 0700 ) ; err != nil {
return err
}
}
2020-05-05 22:02:16 +00:00
opt , err := executor . CurrentETCDOptions ( )
if err != nil {
return err
}
2022-02-24 22:35:08 +00:00
logrus . Infof ( "Starting etcd for existing cluster member" )
2020-05-05 22:02:16 +00:00
return e . cluster ( ctx , false , opt )
}
2020-07-29 20:52:49 +00:00
if clientAccessInfo == nil {
2020-05-05 22:02:16 +00:00
return e . newCluster ( ctx , false )
}
2021-05-01 01:26:39 +00:00
2021-10-12 06:13:10 +00:00
go func ( ) {
2022-02-16 22:19:58 +00:00
for {
select {
case <- time . After ( 30 * time . Second ) :
2024-02-21 18:26:13 +00:00
logrus . Infof ( "Waiting for container runtime to become ready before joining etcd cluster" )
case <- e . config . Runtime . ContainerRuntimeReady :
2024-07-18 19:45:19 +00:00
if err := wait . PollUntilContextCancel ( ctx , time . Second , true , func ( ctx context . Context ) ( bool , error ) {
2023-04-25 22:35:22 +00:00
if err := e . join ( ctx , clientAccessInfo ) ; err != nil {
// Retry the join if waiting for another member to be promoted, or waiting for peers to connect after promotion
2023-04-27 21:13:39 +00:00
if errors . Is ( err , rpctypes . ErrTooManyLearners ) || errors . Is ( err , rpctypes . ErrUnhealthy ) {
logrus . Infof ( "Waiting for other members to finish joining etcd cluster: %v" , err )
2023-04-25 22:35:22 +00:00
return false , nil
}
2024-03-11 22:20:21 +00:00
// Retry the join if waiting to retrieve the member list from the server
if errors . Is ( err , ErrMemberListFailed ) {
logrus . Infof ( "Waiting to retrieve etcd cluster member list: %v" , err )
return false , nil
}
2023-04-25 22:35:22 +00:00
return false , err
}
return true , nil
} ) ; err != nil {
logrus . Fatalf ( "etcd cluster join failed: %v" , err )
2022-02-16 22:19:58 +00:00
}
return
case <- ctx . Done ( ) :
return
}
2021-10-12 06:13:10 +00:00
}
} ( )
return nil
2020-05-05 22:02:16 +00:00
}
2023-09-21 18:53:50 +00:00
// startClient sets up the config's datastore endpoints, and starts an etcd client connected to the server endpoint.
// The client is destroyed when the context is closed.
func ( e * ETCD ) startClient ( ctx context . Context ) error {
if e . client != nil {
return errors . New ( "etcd datastore already started" )
}
endpoints := getEndpoints ( e . config )
e . config . Datastore . Endpoint = endpoints [ 0 ]
e . config . Datastore . BackendTLSConfig . CAFile = e . config . Runtime . ETCDServerCA
e . config . Datastore . BackendTLSConfig . CertFile = e . config . Runtime . ClientETCDCert
e . config . Datastore . BackendTLSConfig . KeyFile = e . config . Runtime . ClientETCDKey
2024-10-24 00:16:34 +00:00
client , conn , err := getClient ( ctx , e . config , endpoints ... )
2023-09-21 18:53:50 +00:00
if err != nil {
return err
}
e . client = client
go func ( ) {
<- ctx . Done ( )
e . client = nil
2024-10-24 00:16:34 +00:00
conn . Close ( )
2023-09-21 18:53:50 +00:00
} ( )
return nil
}
2020-09-22 03:23:18 +00:00
// join attempts to add a member to an existing cluster
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) join ( ctx context . Context , clientAccessInfo * clientaccess . Info ) error {
2021-09-15 05:13:31 +00:00
clientCtx , cancel := context . WithTimeout ( ctx , 20 * time . Second )
defer cancel ( )
var (
cluster [ ] string
add = true
)
clientURLs , memberList , err := ClientURLs ( clientCtx , clientAccessInfo , e . config . PrivateIP )
2020-05-05 22:02:16 +00:00
if err != nil {
return err
}
2024-10-24 00:16:34 +00:00
client , conn , err := getClient ( clientCtx , e . config , clientURLs ... )
2020-05-05 22:02:16 +00:00
if err != nil {
return err
}
2024-10-24 00:16:34 +00:00
defer conn . Close ( )
2020-05-05 22:02:16 +00:00
2023-04-27 21:13:39 +00:00
for _ , member := range memberList . Members {
2020-05-05 22:02:16 +00:00
for _ , peer := range member . PeerURLs {
u , err := url . Parse ( peer )
if err != nil {
return err
}
2021-12-18 06:26:04 +00:00
// An uninitialized joining member won't have a name; if it has our
// address it must be us.
2020-05-05 22:02:16 +00:00
if member . Name == "" && u . Hostname ( ) == e . address {
member . Name = e . name
}
2021-12-18 06:26:04 +00:00
// If we're already in the cluster, don't try to add ourselves.
if member . Name == e . name && u . Hostname ( ) == e . address {
add = false
}
2020-05-05 22:02:16 +00:00
if len ( member . PeerURLs ) > 0 {
cluster = append ( cluster , fmt . Sprintf ( "%s=%s" , member . Name , member . PeerURLs [ 0 ] ) )
}
}
2021-12-18 06:26:04 +00:00
// Try to get the node name from the member name
memberNodeName := member . Name
if lastHyphen := strings . LastIndex ( member . Name , "-" ) ; lastHyphen > 1 {
memberNodeName = member . Name [ : lastHyphen ]
}
// Make sure there's not already a member in the cluster with a duplicate node name
if member . Name != e . name && memberNodeName == e . config . ServerNodeName {
// make sure to remove the name file if a duplicate node name is used, so that we
// generate a new member name when our node name is fixed.
nameFile := nameFile ( e . config )
if err := os . Remove ( nameFile ) ; err != nil {
logrus . Errorf ( "Failed to remove etcd name file %s: %v" , nameFile , err )
}
return errors . New ( "duplicate node name found, please use a unique name for this node" )
}
2020-05-05 22:02:16 +00:00
}
if add {
2021-10-12 06:13:10 +00:00
logrus . Infof ( "Adding member %s=%s to etcd cluster %v" , e . name , e . peerURL ( ) , cluster )
2021-09-15 05:13:31 +00:00
if _ , err = client . MemberAddAsLearner ( clientCtx , [ ] string { e . peerURL ( ) } ) ; err != nil {
2020-05-05 22:02:16 +00:00
return err
}
cluster = append ( cluster , fmt . Sprintf ( "%s=%s" , e . name , e . peerURL ( ) ) )
}
2022-02-24 22:35:08 +00:00
logrus . Infof ( "Starting etcd to join cluster with members %v" , cluster )
2020-05-05 22:02:16 +00:00
return e . cluster ( ctx , false , executor . InitialOptions {
Cluster : strings . Join ( cluster , "," ) ,
State : "existing" ,
} )
}
2023-09-21 18:53:50 +00:00
// Register adds db info routes for the http request handler, and registers cluster controller callbacks
func ( e * ETCD ) Register ( handler http . Handler ) ( http . Handler , error ) {
2023-02-08 00:37:10 +00:00
e . config . Runtime . ClusterControllerStarts [ "etcd-node-metadata" ] = func ( ctx context . Context ) {
registerMetadataHandlers ( ctx , e )
2021-09-14 15:20:38 +00:00
}
2023-02-08 00:37:10 +00:00
// The apiserver endpoint controller needs to run on a node with a local apiserver,
2023-02-13 20:00:52 +00:00
// in order to successfully seed etcd with the endpoint list. The member removal controller
// also needs to run on a non-etcd node as to avoid disruption if running on the node that
// is being removed from the cluster.
2023-02-08 00:37:10 +00:00
if ! e . config . DisableAPIServer {
2023-02-13 20:00:52 +00:00
e . config . Runtime . LeaderElectedClusterControllerStarts [ version . Program + "-etcd" ] = func ( ctx context . Context ) {
2023-02-08 00:37:10 +00:00
registerEndpointsHandlers ( ctx , e )
2023-02-13 20:00:52 +00:00
registerMemberHandlers ( ctx , e )
2023-10-03 17:13:26 +00:00
registerSnapshotHandlers ( ctx , e )
2024-04-29 23:29:49 +00:00
// Re-run informer factory startup after core and leader-elected controllers have started.
// Additional caches may need to start for the newly added OnChange/OnRemove callbacks.
if err := start . All ( ctx , 5 , e . config . Runtime . K3s , e . config . Runtime . Core ) ; err != nil {
panic ( errors . Wrap ( err , "failed to start wrangler controllers" ) )
}
2023-02-08 00:37:10 +00:00
}
2020-05-05 22:02:16 +00:00
}
2022-04-06 00:11:24 +00:00
2023-02-13 20:00:52 +00:00
// Tombstone file checking is unnecessary if we're not running etcd.
2023-02-08 00:37:10 +00:00
if ! e . config . DisableETCD {
2023-09-21 18:48:21 +00:00
tombstoneFile := filepath . Join ( dbDir ( e . config ) , "tombstone" )
2023-02-08 00:37:10 +00:00
if _ , err := os . Stat ( tombstoneFile ) ; err == nil {
logrus . Infof ( "tombstone file has been detected, removing data dir to rejoin the cluster" )
2023-09-21 18:48:21 +00:00
if _ , err := backupDirWithRetention ( dbDir ( e . config ) , maxBackupRetention ) ; err != nil {
2023-02-08 00:37:10 +00:00
return nil , err
}
}
if err := e . setName ( false ) ; err != nil {
return nil , err
}
2020-12-07 20:30:44 +00:00
}
2021-09-14 15:20:38 +00:00
2023-02-08 00:37:10 +00:00
return e . handler ( handler ) , nil
2020-05-05 22:02:16 +00:00
}
2020-09-22 03:23:18 +00:00
// setName sets a unique name for this cluster member. The first time this is called,
// or if force is set to true, a new name will be generated and written to disk. The persistent
// name is used on subsequent calls.
2020-09-19 01:09:36 +00:00
func ( e * ETCD ) setName ( force bool ) error {
2020-05-05 22:02:16 +00:00
fileName := nameFile ( e . config )
2022-10-08 00:36:57 +00:00
data , err := os . ReadFile ( fileName )
2020-09-19 01:09:36 +00:00
if os . IsNotExist ( err ) || force {
2024-02-16 18:13:33 +00:00
if e . config . ServerNodeName == "" {
return errors . New ( "server node name not set" )
}
2021-09-17 22:51:18 +00:00
e . name = e . config . ServerNodeName + "-" + uuid . New ( ) . String ( ) [ : 8 ]
2020-09-24 06:01:35 +00:00
if err := os . MkdirAll ( filepath . Dir ( fileName ) , 0700 ) ; err != nil {
2020-05-05 22:02:16 +00:00
return err
}
2022-10-08 00:36:57 +00:00
return os . WriteFile ( fileName , [ ] byte ( e . name ) , 0600 )
2020-05-05 22:02:16 +00:00
} else if err != nil {
return err
}
e . name = string ( data )
return nil
}
2020-09-24 06:29:25 +00:00
// handler wraps the handler with routes for database info
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) handler ( next http . Handler ) http . Handler {
2024-04-22 20:23:34 +00:00
r := mux . NewRouter ( ) . SkipClean ( true )
r . NotFoundHandler = next
ir := r . Path ( "/db/info" ) . Subrouter ( )
ir . Use ( auth . IsLocalOrHasRole ( e . config , version . Program + ":server" ) )
ir . Handle ( "" , e . infoHandler ( ) )
sr := r . Path ( "/db/snapshot" ) . Subrouter ( )
sr . Use ( auth . HasRole ( e . config , version . Program + ":server" ) )
sr . Handle ( "" , e . snapshotHandler ( ) )
return r
2020-05-05 22:02:16 +00:00
}
2021-10-12 06:13:10 +00:00
// infoHandler returns etcd cluster information. This is used by new members when joining the cluster.
2023-04-27 21:13:39 +00:00
// If we can't retrieve an actual MemberList from etcd, we return a canned response with only the local node listed.
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) infoHandler ( ) http . Handler {
return http . HandlerFunc ( func ( rw http . ResponseWriter , req * http . Request ) {
2024-03-28 00:48:13 +00:00
if req . Method != http . MethodGet {
util . SendError ( fmt . Errorf ( "method not allowed" ) , rw , req , http . StatusMethodNotAllowed )
return
}
2020-05-05 22:02:16 +00:00
ctx , cancel := context . WithTimeout ( req . Context ( ) , 2 * time . Second )
defer cancel ( )
members , err := e . client . MemberList ( ctx )
if err != nil {
2024-03-11 22:20:21 +00:00
util . SendError ( errors . Wrap ( err , "failed to get etcd MemberList" ) , rw , req , http . StatusInternalServerError )
return
2020-05-05 22:02:16 +00:00
}
rw . Header ( ) . Set ( "Content-Type" , "application/json" )
json . NewEncoder ( rw ) . Encode ( & Members {
Members : members . Members ,
} )
} )
}
2023-09-21 18:48:21 +00:00
// getClient returns an etcd client connected to the specified endpoints.
2022-02-24 22:35:08 +00:00
// If no endpoints are provided, endpoints are retrieved from the provided runtime config.
// If the runtime config does not list any endpoints, the default endpoint is used.
// The returned client should be closed when no longer needed, in order to avoid leaking GRPC
// client goroutines.
2024-10-24 00:16:34 +00:00
func getClient ( ctx context . Context , control * config . Control , endpoints ... string ) ( * clientv3 . Client , * grpc . ClientConn , error ) {
logger , err := logutil . CreateDefaultZapLogger ( zapcore . DebugLevel )
if err != nil {
return nil , nil , err
}
2022-04-12 16:59:47 +00:00
cfg , err := getClientConfig ( ctx , control , endpoints ... )
2020-05-05 22:02:16 +00:00
if err != nil {
2024-10-24 00:16:34 +00:00
return nil , nil , err
}
// Set up dialer and resolver options.
// This is normally handled by clientv3.New() but that wraps all the GRPC
// service with retry handlers and uses deprecated grpc.DialContext() which
// tries to establish a connection even when one isn't wanted.
if cfg . DialKeepAliveTime > 0 {
params := keepalive . ClientParameters {
Time : cfg . DialKeepAliveTime ,
Timeout : cfg . DialKeepAliveTimeout ,
PermitWithoutStream : cfg . PermitWithoutStream ,
}
cfg . DialOptions = append ( cfg . DialOptions , grpc . WithKeepaliveParams ( params ) )
}
if cfg . TLS != nil {
creds := credentials . NewBundle ( credentials . Config { TLSConfig : cfg . TLS } ) . TransportCredentials ( )
cfg . DialOptions = append ( cfg . DialOptions , grpc . WithTransportCredentials ( creds ) )
} else {
cfg . DialOptions = append ( cfg . DialOptions , grpc . WithTransportCredentials ( insecure . NewCredentials ( ) ) )
2020-05-05 22:02:16 +00:00
}
2021-11-10 12:33:42 +00:00
2024-10-24 00:16:34 +00:00
cfg . DialOptions = append ( cfg . DialOptions , grpc . WithResolvers ( NewSimpleResolver ( cfg . Endpoints [ 0 ] ) ) )
target := fmt . Sprintf ( "%s://%p/%s" , scheme , cfg , authority ( cfg . Endpoints [ 0 ] ) )
conn , err := grpc . NewClient ( target , cfg . DialOptions ... )
if err != nil {
return nil , nil , err
}
// Create a new client and wire up the GRPC service interfaces.
// Ref: https://github.com/etcd-io/etcd/blob/v3.5.16/client/v3/client.go#L87
client := clientv3 . NewCtxClient ( ctx , clientv3 . WithZapLogger ( logger . Named ( version . Program + "-etcd-client" ) ) )
client . Cluster = clientv3 . NewClusterFromClusterClient ( etcdserverpb . NewClusterClient ( conn ) , client )
client . KV = clientv3 . NewKVFromKVClient ( etcdserverpb . NewKVClient ( conn ) , client )
client . Maintenance = clientv3 . NewMaintenanceFromMaintenanceClient ( etcdserverpb . NewMaintenanceClient ( conn ) , client )
return client , conn , nil
2020-05-05 22:02:16 +00:00
}
2022-02-24 22:35:08 +00:00
// getClientConfig generates an etcd client config connected to the specified endpoints.
// If no endpoints are provided, getEndpoints is called to provide defaults.
2022-04-12 16:59:47 +00:00
func getClientConfig ( ctx context . Context , control * config . Control , endpoints ... string ) ( * clientv3 . Config , error ) {
runtime := control . Runtime
2022-02-24 22:35:08 +00:00
if len ( endpoints ) == 0 {
2022-04-12 16:59:47 +00:00
endpoints = getEndpoints ( control )
2022-02-24 22:35:08 +00:00
}
2022-03-29 18:45:21 +00:00
config := & clientv3 . Config {
2020-09-22 03:23:18 +00:00
Endpoints : endpoints ,
Context : ctx ,
DialTimeout : defaultDialTimeout ,
DialKeepAliveTime : defaultKeepAliveTime ,
2020-10-27 18:06:26 +00:00
DialKeepAliveTimeout : defaultKeepAliveTimeout ,
2023-02-13 20:00:52 +00:00
PermitWithoutStream : true ,
2022-03-29 18:45:21 +00:00
}
var err error
if strings . HasPrefix ( endpoints [ 0 ] , "https://" ) {
config . TLS , err = toTLSConfig ( runtime )
}
return config , err
2020-05-05 22:02:16 +00:00
}
2022-02-24 22:35:08 +00:00
// getEndpoints returns the endpoints from the runtime config if set, otherwise the default endpoint.
2022-04-12 16:59:47 +00:00
func getEndpoints ( control * config . Control ) [ ] string {
runtime := control . Runtime
2022-02-24 22:35:08 +00:00
if len ( runtime . EtcdConfig . Endpoints ) > 0 {
return runtime . EtcdConfig . Endpoints
}
2022-07-21 21:40:09 +00:00
return [ ] string { fmt . Sprintf ( "https://%s:2379" , control . Loopback ( true ) ) }
2022-02-24 22:35:08 +00:00
}
2020-09-22 03:23:18 +00:00
// toTLSConfig converts the ControlRuntime configuration to TLS configuration suitable
// for use by etcd.
2020-05-05 22:02:16 +00:00
func toTLSConfig ( runtime * config . ControlRuntime ) ( * tls . Config , error ) {
2021-10-12 06:13:10 +00:00
if runtime . ClientETCDCert == "" || runtime . ClientETCDKey == "" || runtime . ETCDServerCA == "" {
2024-04-25 23:49:47 +00:00
return nil , util . ErrCoreNotReady
2021-10-12 06:13:10 +00:00
}
2020-05-05 22:02:16 +00:00
clientCert , err := tls . LoadX509KeyPair ( runtime . ClientETCDCert , runtime . ClientETCDKey )
if err != nil {
return nil , err
}
pool , err := certutil . NewPool ( runtime . ETCDServerCA )
if err != nil {
return nil , err
}
return & tls . Config {
RootCAs : pool ,
Certificates : [ ] tls . Certificate { clientCert } ,
} , nil
}
2020-09-22 03:23:18 +00:00
// getAdvertiseAddress returns the IP address best suited for advertising to clients
2023-03-24 22:19:44 +00:00
func getAdvertiseAddress ( advertiseIP string ) ( string , error ) {
2020-05-05 22:02:16 +00:00
ip := advertiseIP
if ip == "" {
ipAddr , err := utilnet . ChooseHostInterface ( )
if err != nil {
return "" , err
}
ip = ipAddr . String ( )
}
return ip , nil
}
2020-09-22 03:23:18 +00:00
// newCluster returns options to set up etcd for a new cluster
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) newCluster ( ctx context . Context , reset bool ) error {
2024-01-16 22:43:08 +00:00
logrus . Infof ( "Starting etcd for new cluster, cluster-reset=%v" , reset )
2021-04-26 16:47:53 +00:00
err := e . cluster ( ctx , reset , executor . InitialOptions {
2021-10-12 06:13:10 +00:00
AdvertisePeerURL : e . peerURL ( ) ,
Cluster : fmt . Sprintf ( "%s=%s" , e . name , e . peerURL ( ) ) ,
2020-05-05 22:02:16 +00:00
State : "new" ,
} )
2021-04-26 16:47:53 +00:00
if err != nil {
return err
}
2024-01-16 22:43:08 +00:00
if ! reset {
if err := e . migrateFromSQLite ( ctx ) ; err != nil {
return fmt . Errorf ( "failed to migrate content from sqlite to etcd: %w" , err )
}
2021-04-26 16:47:53 +00:00
}
return nil
}
func ( e * ETCD ) migrateFromSQLite ( ctx context . Context ) error {
_ , err := os . Stat ( sqliteFile ( e . config ) )
if os . IsNotExist ( err ) {
return nil
} else if err != nil {
return err
}
logrus . Infof ( "Migrating content from sqlite to etcd" )
ctx , cancel := context . WithCancel ( ctx )
defer cancel ( )
_ , err = endpoint2 . Listen ( ctx , endpoint2 . Config {
2024-10-02 22:34:32 +00:00
Endpoint : "sqlite://" ,
2021-04-26 16:47:53 +00:00
} )
if err != nil {
return err
}
sqliteClient , err := client . New ( endpoint2 . ETCDConfig {
Endpoints : [ ] string { "unix://kine.sock" } ,
} )
if err != nil {
return err
}
defer sqliteClient . Close ( )
2024-10-24 00:16:34 +00:00
etcdClient , conn , err := getClient ( ctx , e . config )
2021-04-26 16:47:53 +00:00
if err != nil {
return err
}
2024-10-24 00:16:34 +00:00
defer conn . Close ( )
2021-04-26 16:47:53 +00:00
values , err := sqliteClient . List ( ctx , "/registry/" , 0 )
if err != nil {
return err
}
for _ , value := range values {
logrus . Infof ( "Migrating etcd key %s" , value . Key )
_ , err := etcdClient . Put ( ctx , string ( value . Key ) , string ( value . Data ) )
if err != nil {
return err
}
}
return os . Rename ( sqliteFile ( e . config ) , sqliteFile ( e . config ) + ".migrated" )
2020-05-05 22:02:16 +00:00
}
2022-05-05 08:10:08 +00:00
// peerURL returns the external peer access address for the local node.
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) peerURL ( ) string {
2022-04-15 00:31:49 +00:00
return fmt . Sprintf ( "https://%s" , net . JoinHostPort ( e . address , "2380" ) )
2020-05-05 22:02:16 +00:00
}
2022-05-05 08:10:08 +00:00
// listenClientURLs returns a list of URLs to bind to for peer connections.
// During cluster reset/restore, we only listen on loopback to avoid having peers
// connect mid-process.
func ( e * ETCD ) listenPeerURLs ( reset bool ) string {
2022-07-21 21:40:09 +00:00
peerURLs := fmt . Sprintf ( "https://%s:2380" , e . config . Loopback ( true ) )
2022-05-05 08:10:08 +00:00
if ! reset {
peerURLs += "," + e . peerURL ( )
}
return peerURLs
}
// clientURL returns the external client access address for the local node.
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) clientURL ( ) string {
2022-04-15 00:31:49 +00:00
return fmt . Sprintf ( "https://%s" , net . JoinHostPort ( e . address , "2379" ) )
2020-05-05 22:02:16 +00:00
}
2023-03-24 22:19:44 +00:00
// advertiseClientURLs returns the advertised addresses for the local node.
// During cluster reset/restore we only listen on loopback to avoid having apiservers
// on other nodes connect mid-process.
func ( e * ETCD ) advertiseClientURLs ( reset bool ) string {
if reset {
2024-01-16 22:43:08 +00:00
return fmt . Sprintf ( "https://%s:2379" , e . config . Loopback ( true ) )
2023-03-24 22:19:44 +00:00
}
return e . clientURL ( )
}
2022-05-05 08:10:08 +00:00
// listenClientURLs returns a list of URLs to bind to for client connections.
2023-03-24 22:19:44 +00:00
// During cluster reset/restore, we only listen on loopback to avoid having apiservers
// on other nodes connect mid-process.
2022-05-05 08:10:08 +00:00
func ( e * ETCD ) listenClientURLs ( reset bool ) string {
2022-07-21 21:40:09 +00:00
clientURLs := fmt . Sprintf ( "https://%s:2379" , e . config . Loopback ( true ) )
2022-05-05 08:10:08 +00:00
if ! reset {
clientURLs += "," + e . clientURL ( )
}
return clientURLs
}
// listenMetricsURLs returns a list of URLs to bind to for metrics connections.
func ( e * ETCD ) listenMetricsURLs ( reset bool ) string {
2022-07-21 21:40:09 +00:00
metricsURLs := fmt . Sprintf ( "http://%s:2381" , e . config . Loopback ( true ) )
2022-05-05 08:10:08 +00:00
if ! reset && e . config . EtcdExposeMetrics {
metricsURLs += "," + fmt . Sprintf ( "http://%s" , net . JoinHostPort ( e . address , "2381" ) )
2021-01-23 01:40:48 +00:00
}
2022-05-05 08:10:08 +00:00
return metricsURLs
2021-01-23 01:40:48 +00:00
}
2023-09-22 06:54:03 +00:00
// listenClientHTTPURLs returns the URL to bind to for http client connections
// (loopback, port 2382).
// This should no longer be used, but we must set it in order to free the listen URLs
// for dedicated use by GRPC.
// Ref: https://github.com/etcd-io/etcd/issues/15402
func (e *ETCD) listenClientHTTPURLs() string {
	loopback := e.config.Loopback(true)
	return fmt.Sprintf("https://%s:2382", loopback)
}
2022-05-05 08:10:08 +00:00
// cluster calls the executor to start etcd running with the provided configuration.
func ( e * ETCD ) cluster ( ctx context . Context , reset bool , options executor . InitialOptions ) error {
2022-04-27 20:44:15 +00:00
ctx , e . cancel = context . WithCancel ( ctx )
2021-09-08 17:56:18 +00:00
return executor . ETCD ( ctx , executor . ETCDConfig {
2020-05-05 22:02:16 +00:00
Name : e . name ,
InitialOptions : options ,
2022-05-05 08:10:08 +00:00
ForceNewCluster : reset ,
ListenClientURLs : e . listenClientURLs ( reset ) ,
ListenMetricsURLs : e . listenMetricsURLs ( reset ) ,
ListenPeerURLs : e . listenPeerURLs ( reset ) ,
2023-03-24 22:19:44 +00:00
AdvertiseClientURLs : e . advertiseClientURLs ( reset ) ,
2023-09-21 18:48:21 +00:00
DataDir : dbDir ( e . config ) ,
2020-05-05 22:02:16 +00:00
ServerTrust : executor . ServerTrust {
CertFile : e . config . Runtime . ServerETCDCert ,
KeyFile : e . config . Runtime . ServerETCDKey ,
ClientCertAuth : true ,
TrustedCAFile : e . config . Runtime . ETCDServerCA ,
} ,
PeerTrust : executor . PeerTrust {
CertFile : e . config . Runtime . PeerServerClientETCDCert ,
KeyFile : e . config . Runtime . PeerServerClientETCDKey ,
ClientCertAuth : true ,
TrustedCAFile : e . config . Runtime . ETCDPeerCA ,
} ,
2024-02-02 21:10:05 +00:00
SnapshotCount : 10000 ,
ElectionTimeout : 5000 ,
HeartbeatInterval : 500 ,
Logger : "zap" ,
LogOutputs : [ ] string { "stderr" } ,
ListenClientHTTPURLs : e . listenClientHTTPURLs ( ) ,
ExperimentalInitialCorruptCheck : true ,
ExperimentalWatchProgressNotifyInterval : e . config . Datastore . NotifyInterval ,
2021-11-12 05:03:15 +00:00
} , e . config . ExtraEtcdArgs )
2020-05-05 22:02:16 +00:00
}
2022-02-24 22:35:08 +00:00
func ( e * ETCD ) StartEmbeddedTemporary ( ctx context . Context ) error {
2023-09-21 18:48:21 +00:00
etcdDataDir := dbDir ( e . config )
2022-02-24 22:35:08 +00:00
tmpDataDir := etcdDataDir + "-tmp"
os . RemoveAll ( tmpDataDir )
2022-03-25 18:52:40 +00:00
go func ( ) {
<- ctx . Done ( )
2022-02-24 22:35:08 +00:00
if err := os . RemoveAll ( tmpDataDir ) ; err != nil {
logrus . Warnf ( "Failed to remove etcd temp dir: %v" , err )
}
} ( )
2023-09-21 18:53:50 +00:00
if e . client != nil {
return errors . New ( "etcd datastore already started" )
}
2024-10-24 00:16:34 +00:00
client , conn , err := getClient ( ctx , e . config )
2023-09-21 18:53:50 +00:00
if err != nil {
return err
}
e . client = client
go func ( ) {
<- ctx . Done ( )
e . client = nil
2024-10-24 00:16:34 +00:00
conn . Close ( )
2023-09-21 18:53:50 +00:00
} ( )
2022-02-24 22:35:08 +00:00
if err := cp . Copy ( etcdDataDir , tmpDataDir , cp . Options { PreserveOwner : true } ) ; err != nil {
return err
}
2022-04-12 16:59:47 +00:00
endpoints := getEndpoints ( e . config )
2022-02-24 22:35:08 +00:00
clientURL := endpoints [ 0 ]
2023-09-22 06:54:03 +00:00
// peer URL is usually 1 more than client
2022-02-24 22:35:08 +00:00
peerURL , err := addPort ( endpoints [ 0 ] , 1 )
if err != nil {
return err
}
2023-09-22 06:54:03 +00:00
// client http URL is usually 3 more than client, after peer and metrics
clientHTTPURL , err := addPort ( endpoints [ 0 ] , 3 )
if err != nil {
return err
}
2022-02-24 22:35:08 +00:00
embedded := executor . Embedded { }
2022-04-27 20:44:15 +00:00
ctx , e . cancel = context . WithCancel ( ctx )
2022-02-24 22:35:08 +00:00
return embedded . ETCD ( ctx , executor . ETCDConfig {
2024-02-02 21:10:05 +00:00
InitialOptions : executor . InitialOptions { AdvertisePeerURL : peerURL } ,
DataDir : tmpDataDir ,
ForceNewCluster : true ,
AdvertiseClientURLs : clientURL ,
ListenClientURLs : clientURL ,
ListenClientHTTPURLs : clientHTTPURL ,
ListenPeerURLs : peerURL ,
Logger : "zap" ,
HeartbeatInterval : 500 ,
ElectionTimeout : 5000 ,
SnapshotCount : 10000 ,
Name : e . name ,
LogOutputs : [ ] string { "stderr" } ,
ExperimentalInitialCorruptCheck : true ,
ExperimentalWatchProgressNotifyInterval : e . config . Datastore . NotifyInterval ,
2023-04-27 21:13:39 +00:00
} , append ( e . config . ExtraEtcdArgs , "--max-snapshots=0" , "--max-wals=0" ) )
2022-02-24 22:35:08 +00:00
}
// addPort parses address as a URL and returns "<scheme>://<host>:<port+offset>".
// Only the scheme, host and port are kept; any path, query or userinfo in
// address is dropped. An error is returned if address cannot be parsed or does
// not carry a numeric port.
func addPort(address string, offset int) (string, error) {
	u, err := url.Parse(address)
	if err != nil {
		return "", err
	}
	port, err := strconv.Atoi(u.Port())
	if err != nil {
		return "", err
	}
	port += offset
	// Hostname() strips the brackets from IPv6 literals; JoinHostPort puts them
	// back so that the result is still a valid URL.
	return fmt.Sprintf("%s://%s", u.Scheme, net.JoinHostPort(u.Hostname(), strconv.Itoa(port))), nil
}
2021-09-14 15:20:38 +00:00
// RemovePeer removes a peer from the cluster. The peer name and IP address must both match.
func ( e * ETCD ) RemovePeer ( ctx context . Context , name , address string , allowSelfRemoval bool ) error {
2021-08-05 20:32:01 +00:00
ctx , cancel := context . WithTimeout ( ctx , memberRemovalTimeout )
defer cancel ( )
2020-05-05 22:02:16 +00:00
members , err := e . client . MemberList ( ctx )
if err != nil {
return err
}
for _ , member := range members . Members {
2021-09-14 15:20:38 +00:00
if member . Name != name {
2020-05-05 22:02:16 +00:00
continue
}
for _ , peerURL := range member . PeerURLs {
u , err := url . Parse ( peerURL )
if err != nil {
return err
}
if u . Hostname ( ) == address {
2021-09-14 15:20:38 +00:00
if e . address == address && ! allowSelfRemoval {
return errors . New ( "not removing self from etcd cluster" )
2020-10-28 16:32:51 +00:00
}
2020-05-05 22:02:16 +00:00
logrus . Infof ( "Removing name=%s id=%d address=%s from etcd" , member . Name , member . ID , address )
_ , err := e . client . MemberRemove ( ctx , member . ID )
2023-04-05 00:52:14 +00:00
if errors . Is ( err , rpctypes . ErrGRPCMemberNotFound ) {
2020-10-28 16:32:51 +00:00
return nil
}
2020-05-05 22:02:16 +00:00
return err
}
}
}
return nil
}
2020-07-29 20:52:49 +00:00
2020-10-27 18:06:26 +00:00
// manageLearners monitors the etcd cluster to ensure that learners are making progress towards
// being promoted to full voting member. The checks only run on the cluster member that is
// the etcd leader.
2022-02-23 21:52:46 +00:00
func ( e * ETCD ) manageLearners ( ctx context . Context ) {
2024-02-21 18:26:13 +00:00
<- e . config . Runtime . ContainerRuntimeReady
2020-10-27 18:06:26 +00:00
t := time . NewTicker ( manageTickerTime )
2020-07-29 20:52:49 +00:00
defer t . Stop ( )
2020-10-27 18:06:26 +00:00
2020-07-29 20:52:49 +00:00
for range t . C {
2022-07-20 00:21:23 +00:00
ctx , cancel := context . WithTimeout ( ctx , manageTickerTime )
2020-10-27 18:06:26 +00:00
defer cancel ( )
// Check to see if the local node is the leader. Only the leader should do learner management.
2021-07-26 16:59:33 +00:00
if e . client == nil {
2022-02-23 21:52:46 +00:00
logrus . Debug ( "Etcd client was nil" )
2021-07-26 16:59:33 +00:00
continue
}
2023-12-21 00:23:27 +00:00
2022-04-12 16:59:47 +00:00
endpoints := getEndpoints ( e . config )
2022-02-24 22:35:08 +00:00
if status , err := e . client . Status ( ctx , endpoints [ 0 ] ) ; err != nil {
2020-10-27 18:06:26 +00:00
logrus . Errorf ( "Failed to check local etcd status for learner management: %v" , err )
continue
} else if status . Header . MemberId != status . Leader {
continue
}
progress , err := e . getLearnerProgress ( ctx )
2020-07-29 20:52:49 +00:00
if err != nil {
2020-10-27 18:06:26 +00:00
logrus . Errorf ( "Failed to get recorded learner progress from etcd: %v" , err )
2020-07-29 20:52:49 +00:00
continue
}
2020-10-27 18:06:26 +00:00
members , err := e . client . MemberList ( ctx )
2020-07-29 20:52:49 +00:00
if err != nil {
2020-10-27 18:06:26 +00:00
logrus . Errorf ( "Failed to get etcd members for learner management: %v" , err )
2020-07-29 20:52:49 +00:00
continue
}
2020-10-27 18:06:26 +00:00
2023-11-16 23:58:42 +00:00
client , err := util . GetClientSet ( e . config . Runtime . KubeConfigSupervisor )
2023-11-13 14:39:24 +00:00
if err != nil {
2023-11-16 23:58:42 +00:00
logrus . Errorf ( "Failed to get k8s client for patch node status condition: %v" , err )
2023-11-13 14:39:24 +00:00
continue
}
2023-11-16 23:58:42 +00:00
nodes , err := e . getETCDNodes ( )
2023-11-13 14:39:24 +00:00
if err != nil {
2023-11-16 23:58:42 +00:00
logrus . Warnf ( "Failed to list nodes with etcd role: %v" , err )
2023-11-13 14:39:24 +00:00
}
2023-12-21 00:23:27 +00:00
// a map to track if a node is a member of the etcd cluster or not
nodeIsMember := make ( map [ string ] bool )
nodesMap := make ( map [ string ] * v1 . Node )
for _ , node := range nodes {
nodeIsMember [ node . Name ] = false
nodesMap [ node . Name ] = node
}
2020-07-29 20:52:49 +00:00
for _ , member := range members . Members {
2024-01-10 19:37:56 +00:00
status := StatusVoter
message := ""
2023-12-21 00:23:27 +00:00
2020-10-27 18:06:26 +00:00
if member . IsLearner {
2024-01-10 19:37:56 +00:00
status = StatusLearner
2020-10-27 18:06:26 +00:00
if err := e . trackLearnerProgress ( ctx , progress , member ) ; err != nil {
logrus . Errorf ( "Failed to track learner progress towards promotion: %v" , err )
}
2024-01-10 19:37:56 +00:00
}
2023-12-21 00:23:27 +00:00
2024-01-10 19:37:56 +00:00
var node * v1 . Node
for _ , n := range nodes {
2024-02-16 18:13:33 +00:00
if member . Name == n . Annotations [ NodeNameAnnotation ] {
2024-01-10 19:37:56 +00:00
node = n
nodeIsMember [ n . Name ] = true
break
2023-11-13 14:39:24 +00:00
}
2024-01-10 19:37:56 +00:00
}
if node == nil {
continue
2020-07-29 20:52:49 +00:00
}
2023-11-13 14:39:24 +00:00
2024-01-10 19:37:56 +00:00
// verify if the member is healthy and set the status
2023-12-21 00:23:27 +00:00
if _ , err := e . getETCDStatus ( ctx , member . ClientURLs [ 0 ] ) ; err != nil {
2024-01-10 19:37:56 +00:00
message = err . Error ( )
status = StatusUnhealthy
2023-12-21 00:23:27 +00:00
}
2024-01-10 19:37:56 +00:00
if err := e . setEtcdStatusCondition ( node , client , member . Name , status , message ) ; err != nil {
2023-12-21 00:23:27 +00:00
logrus . Errorf ( "Unable to set etcd status condition %s: %v" , member . Name , err )
}
}
2024-01-10 19:37:56 +00:00
for nodeName , node := range nodesMap {
if ! nodeIsMember [ nodeName ] {
2023-12-21 00:23:27 +00:00
if err := e . setEtcdStatusCondition ( node , client , nodeName , StatusUnjoined , "" ) ; err != nil {
logrus . Errorf ( "Unable to set etcd status condition for a node that is not a cluster member %s: %v" , nodeName , err )
2023-11-13 14:39:24 +00:00
}
}
2020-07-29 20:52:49 +00:00
}
}
2023-11-13 14:39:24 +00:00
}
2023-11-16 23:58:42 +00:00
func ( e * ETCD ) getETCDNodes ( ) ( [ ] * v1 . Node , error ) {
if e . config . Runtime . Core == nil {
2024-04-25 23:49:47 +00:00
return nil , util . ErrCoreNotReady
2023-11-16 23:58:42 +00:00
}
2023-11-13 14:39:24 +00:00
nodes := e . config . Runtime . Core . Core ( ) . V1 ( ) . Node ( )
etcdSelector := labels . Set { util . ETCDRoleLabelKey : "true" }
2023-11-16 23:58:42 +00:00
return nodes . Cache ( ) . List ( etcdSelector . AsSelector ( ) )
2020-07-29 20:52:49 +00:00
}
2020-10-27 18:06:26 +00:00
// trackLearnerProcess attempts to promote a learner. If it cannot be promoted, progress through the raft index is tracked.
// If the learner does not make any progress in a reasonable amount of time, it is evicted from the cluster.
func ( e * ETCD ) trackLearnerProgress ( ctx context . Context , progress * learnerProgress , member * etcdserverpb . Member ) error {
// Try to promote it. If it can be promoted, no further tracking is necessary
if _ , err := e . client . MemberPromote ( ctx , member . ID ) ; err != nil {
logrus . Debugf ( "Unable to promote learner %s: %v" , member . Name , err )
} else {
logrus . Infof ( "Promoted learner %s" , member . Name )
return nil
}
now := time . Now ( )
// If this is the first time we've tracked this member's progress, reset stats
if progress . Name != member . Name || progress . ID != member . ID {
progress . ID = member . ID
progress . Name = member . Name
progress . RaftAppliedIndex = 0
progress . LastProgress . Time = now
}
// Update progress by retrieving status from the member's first reachable client URL
for _ , ep := range member . ClientURLs {
2023-12-21 00:23:27 +00:00
status , err := e . getETCDStatus ( ctx , ep )
2020-10-27 18:06:26 +00:00
if err != nil {
logrus . Debugf ( "Failed to get etcd status from learner %s at %s: %v" , member . Name , ep , err )
continue
}
if progress . RaftAppliedIndex < status . RaftAppliedIndex {
logrus . Debugf ( "Learner %s has progressed from RaftAppliedIndex %d to %d" , progress . Name , progress . RaftAppliedIndex , status . RaftAppliedIndex )
progress . RaftAppliedIndex = status . RaftAppliedIndex
progress . LastProgress . Time = now
}
break
}
// Warn if the learner hasn't made any progress
if ! progress . LastProgress . Time . Equal ( now ) {
logrus . Warnf ( "Learner %s stalled at RaftAppliedIndex=%d for %s" , progress . Name , progress . RaftAppliedIndex , now . Sub ( progress . LastProgress . Time ) . String ( ) )
}
// See if it's time to evict yet
if now . Sub ( progress . LastProgress . Time ) > learnerMaxStallTime {
if _ , err := e . client . MemberRemove ( ctx , member . ID ) ; err != nil {
return err
}
logrus . Warnf ( "Removed learner %s from etcd cluster" , member . Name )
return nil
}
return e . setLearnerProgress ( ctx , progress )
}
2023-12-21 00:23:27 +00:00
// getETCDStatus queries the status of the etcd member at the given URL. An
// error is returned if the status call fails or if the response lists any
// member errors; the response is returned alongside the error in both cases.
func (e *ETCD) getETCDStatus(ctx context.Context, url string) (*clientv3.StatusResponse, error) {
	resp, err := e.client.Status(ctx, url)
	switch {
	case err != nil:
		return resp, errors.Wrap(err, "failed to check etcd member status")
	case len(resp.Errors) != 0:
		return resp, errors.New("etcd member has status errors: " + strings.Join(resp.Errors, ","))
	default:
		return resp, nil
	}
}
2023-11-13 14:39:24 +00:00
2023-12-21 00:23:27 +00:00
func ( e * ETCD ) setEtcdStatusCondition ( node * v1 . Node , client kubernetes . Interface , memberName string , memberStatus MemberStatus , message string ) error {
2023-11-13 14:39:24 +00:00
var newCondition v1 . NodeCondition
2023-12-21 00:23:27 +00:00
switch memberStatus {
case StatusLearner :
newCondition = v1 . NodeCondition {
Type : etcdStatusType ,
Status : "False" ,
Reason : "MemberIsLearner" ,
Message : "Node has not been promoted to voting member of the etcd cluster" ,
}
case StatusVoter :
2023-11-13 14:39:24 +00:00
newCondition = v1 . NodeCondition {
Type : etcdStatusType ,
Status : "True" ,
Reason : "MemberNotLearner" ,
Message : "Node is a voting member of the etcd cluster" ,
}
2023-12-21 00:23:27 +00:00
case StatusUnhealthy :
2023-11-13 14:39:24 +00:00
newCondition = v1 . NodeCondition {
Type : etcdStatusType ,
Status : "False" ,
2023-12-21 00:23:27 +00:00
Reason : "Unhealthy" ,
Message : "Node is unhealthy" ,
}
case StatusUnjoined :
newCondition = v1 . NodeCondition {
Type : etcdStatusType ,
Status : "False" ,
Reason : "NotAMember" ,
Message : "Node is not a member of the etcd cluster" ,
2023-11-13 14:39:24 +00:00
}
2023-12-21 00:23:27 +00:00
default :
logrus . Warnf ( "Unknown etcd member status %s" , memberStatus )
return nil
}
if message != "" {
newCondition . Message = message
2023-11-13 14:39:24 +00:00
}
2023-12-21 00:23:27 +00:00
if find , condition := nodeUtil . GetNodeCondition ( & node . Status , etcdStatusType ) ; find >= 0 {
2024-01-18 05:00:18 +00:00
// if the condition is not changing, we only want to update the last heartbeat time
if condition . Status == newCondition . Status && condition . Reason == newCondition . Reason && condition . Message == newCondition . Message {
2023-12-21 00:23:27 +00:00
logrus . Debugf ( "Node %s is not changing etcd status condition" , memberName )
2024-01-18 05:00:18 +00:00
// If the condition status is not changing, we only want to update the last heartbeat time if the
// LastHeartbeatTime is older than the heartbeatTimeout.
if metav1 . Now ( ) . Sub ( condition . LastHeartbeatTime . Time ) < heartbeatInterval {
return nil
}
2023-11-13 14:39:24 +00:00
condition . LastHeartbeatTime = metav1 . Now ( )
2023-12-21 00:23:27 +00:00
return nodeHelper . SetNodeCondition ( client , types . NodeName ( node . Name ) , * condition )
2023-11-13 14:39:24 +00:00
}
2023-12-21 00:23:27 +00:00
logrus . Debugf ( "Node %s is changing etcd status condition" , memberName )
2023-11-13 14:39:24 +00:00
condition = & newCondition
condition . LastHeartbeatTime = metav1 . Now ( )
condition . LastTransitionTime = metav1 . Now ( )
2023-12-21 00:23:27 +00:00
return nodeHelper . SetNodeCondition ( client , types . NodeName ( node . Name ) , * condition )
2023-11-13 14:39:24 +00:00
}
2023-12-21 00:23:27 +00:00
logrus . Infof ( "Adding node %s etcd status condition" , memberName )
2023-11-13 14:39:24 +00:00
newCondition . LastHeartbeatTime = metav1 . Now ( )
newCondition . LastTransitionTime = metav1 . Now ( )
2023-12-21 00:23:27 +00:00
return nodeHelper . SetNodeCondition ( client , types . NodeName ( node . Name ) , newCondition )
2023-11-13 14:39:24 +00:00
}
2020-10-27 18:06:26 +00:00
// getLearnerProgress returns the learnerProgress struct stored in etcd under
// learnerProgressKey. If nothing is stored yet, an empty struct is returned.
func (e *ETCD) getLearnerProgress(ctx context.Context) (*learnerProgress, error) {
	resp, err := e.client.Get(ctx, learnerProgressKey)
	if err != nil {
		return nil, err
	}

	progress := &learnerProgress{}
	if resp.Count >= 1 {
		stored := bytes.NewReader(resp.Kvs[0].Value)
		if err := json.NewDecoder(stored).Decode(progress); err != nil {
			return nil, err
		}
	}
	return progress, nil
}
// setLearnerProgress JSON-encodes the learnerProgress struct and stores it in
// etcd under learnerProgressKey.
func (e *ETCD) setLearnerProgress(ctx context.Context, status *learnerProgress) error {
	var encoded bytes.Buffer
	if err := json.NewEncoder(&encoded).Encode(status); err != nil {
		return err
	}
	if _, err := e.client.Put(ctx, learnerProgressKey, encoded.String()); err != nil {
		return err
	}
	return nil
}
2024-10-24 00:16:34 +00:00
// clearAlarms checks for any NOSPACE alarms on the local etcd member.
// If found, they are reported and the alarm state is cleared.
// Other alarm types are not handled.
func ( e * ETCD ) clearAlarms ( ctx context . Context , memberID uint64 ) error {
2022-02-23 21:52:46 +00:00
if e . client == nil {
return errors . New ( "etcd client was nil" )
}
alarmList , err := e . client . AlarmList ( ctx )
if err != nil {
return fmt . Errorf ( "etcd alarm list failed: %v" , err )
}
for _ , alarm := range alarmList . Alarms {
2024-10-24 00:16:34 +00:00
if alarm . MemberID != memberID {
// ignore alarms on other cluster members, they should manage their own problems
continue
}
if alarm . Alarm == etcdserverpb . AlarmType_NOSPACE {
if _ , err := e . client . AlarmDisarm ( ctx , & clientv3 . AlarmMember { MemberID : alarm . MemberID , Alarm : alarm . Alarm } ) ; err != nil {
return fmt . Errorf ( "%s disarm failed: %v" , alarm . Alarm , err )
}
logrus . Infof ( "%s disarmed successfully" , alarm . Alarm )
} else {
return fmt . Errorf ( "%s alarm must be disarmed manually" , alarm . Alarm )
2022-02-23 21:52:46 +00:00
}
}
return nil
}
2024-10-24 00:16:34 +00:00
// status returns status using the first etcd endpoint.
func ( e * ETCD ) status ( ctx context . Context ) ( * clientv3 . StatusResponse , error ) {
if e . client == nil {
return nil , errors . New ( "etcd client was nil" )
}
ctx , cancel := context . WithTimeout ( ctx , statusTimeout )
2022-03-25 18:52:40 +00:00
defer cancel ( )
2024-10-24 00:16:34 +00:00
endpoints := getEndpoints ( e . config )
return e . client . Status ( ctx , endpoints [ 0 ] )
}
// defragment defragments the etcd datastore using the first etcd endpoint
func ( e * ETCD ) defragment ( ctx context . Context ) error {
2022-03-25 18:52:40 +00:00
if e . client == nil {
return errors . New ( "etcd client was nil" )
}
logrus . Infof ( "Defragmenting etcd database" )
2022-04-12 16:59:47 +00:00
endpoints := getEndpoints ( e . config )
2022-03-25 18:52:40 +00:00
_ , err := e . client . Defragment ( ctx , endpoints [ 0 ] )
return err
}
2022-02-24 22:35:08 +00:00
// clientURLs returns a list of all non-learner etcd cluster member client access URLs.
// The list is retrieved from the remote server that is being joined.
2021-03-01 21:50:50 +00:00
func ClientURLs ( ctx context . Context , clientAccessInfo * clientaccess . Info , selfIP string ) ( [ ] string , Members , error ) {
2020-07-29 20:52:49 +00:00
var memberList Members
2024-04-08 18:04:27 +00:00
// find the address advertised for our own client URL, so that we don't connect to ourselves
ip , err := getAdvertiseAddress ( selfIP )
2020-07-29 20:52:49 +00:00
if err != nil {
2024-04-08 18:04:27 +00:00
return nil , memberList , err
2020-07-29 20:52:49 +00:00
}
2024-04-08 18:04:27 +00:00
// find the client URL of the server we're joining, so we can prioritize it
joinURL , err := url . Parse ( clientAccessInfo . BaseURL )
if err != nil {
2020-07-29 20:52:49 +00:00
return nil , memberList , err
}
2024-04-08 18:04:27 +00:00
// get the full list from the server we're joining
resp , err := clientAccessInfo . Get ( "/db/info" )
2021-03-01 21:50:50 +00:00
if err != nil {
2024-06-11 00:29:17 +00:00
return nil , memberList , & memberListError { err : err }
2024-04-08 18:04:27 +00:00
}
if err := json . Unmarshal ( resp , & memberList ) ; err != nil {
2021-03-01 21:50:50 +00:00
return nil , memberList , err
}
2024-04-08 18:04:27 +00:00
// Build a list of client URLs. Learners and the current node are excluded;
// the server we're joining is listed first if found.
2020-07-29 20:52:49 +00:00
var clientURLs [ ] string
for _ , member := range memberList . Members {
2024-04-08 18:04:27 +00:00
var isSelf , isPreferred bool
2021-08-12 22:59:04 +00:00
for _ , clientURL := range member . ClientURLs {
2024-04-08 18:04:27 +00:00
if u , err := url . Parse ( clientURL ) ; err == nil {
switch u . Hostname ( ) {
case ip :
isSelf = true
case joinURL . Hostname ( ) :
isPreferred = true
}
2021-08-12 22:59:04 +00:00
}
2024-04-08 18:04:27 +00:00
}
if ! member . IsLearner && ! isSelf {
if isPreferred {
clientURLs = append ( member . ClientURLs , clientURLs ... )
} else {
clientURLs = append ( clientURLs , member . ClientURLs ... )
2021-03-01 21:50:50 +00:00
}
}
2020-07-29 20:52:49 +00:00
}
return clientURLs , memberList , nil
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
// Restore performs a restore of the ETCD datastore from
// the given snapshot path. This operation exits upon
// completion.
func ( e * ETCD ) Restore ( ctx context . Context ) error {
// check the old etcd data dir
2023-09-21 18:48:21 +00:00
oldDataDir := dbDir ( e . config ) + "-old-" + strconv . Itoa ( int ( time . Now ( ) . Unix ( ) ) )
2020-09-30 00:53:31 +00:00
if e . config . ClusterResetRestorePath == "" {
return errors . New ( "no etcd restore path was specified" )
}
// make sure snapshot exists before restoration
if _ , err := os . Stat ( e . config . ClusterResetRestorePath ) ; err != nil {
return err
}
2022-01-14 17:31:22 +00:00
var restorePath string
2024-06-11 00:29:17 +00:00
if strings . HasSuffix ( e . config . ClusterResetRestorePath , snapshot . CompressedExtension ) {
dir , err := snapshotDir ( e . config , true )
2022-01-14 17:31:22 +00:00
if err != nil {
return errors . Wrap ( err , "failed to get the snapshot dir" )
}
2024-06-11 00:29:17 +00:00
decompressSnapshot , err := e . decompressSnapshot ( dir , e . config . ClusterResetRestorePath )
2022-01-14 17:31:22 +00:00
if err != nil {
return err
}
restorePath = decompressSnapshot
} else {
restorePath = e . config . ClusterResetRestorePath
}
2020-09-30 00:53:31 +00:00
// move the data directory to a temp path
2023-09-21 18:48:21 +00:00
if err := os . Rename ( dbDir ( e . config ) , oldDataDir ) ; err != nil {
2020-09-30 00:53:31 +00:00
return err
}
2022-01-14 17:31:22 +00:00
2020-09-30 00:53:31 +00:00
logrus . Infof ( "Pre-restore etcd database moved to %s" , oldDataDir )
2024-06-11 00:29:17 +00:00
return snapshotv3 . NewV3 ( e . client . GetLogger ( ) ) . Restore ( snapshotv3 . RestoreConfig {
2022-01-14 17:31:22 +00:00
SnapshotPath : restorePath ,
2020-09-30 00:53:31 +00:00
Name : e . name ,
2023-09-21 18:48:21 +00:00
OutputDataDir : dbDir ( e . config ) ,
2020-09-30 00:53:31 +00:00
OutputWALDir : walDir ( e . config ) ,
PeerURLs : [ ] string { e . peerURL ( ) } ,
InitialCluster : e . name + "=" + e . peerURL ( ) ,
2021-07-03 11:24:58 +00:00
} )
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
}
2020-12-07 20:30:44 +00:00
// backupDirWithRetention moves dir to a sibling directory named
// "<dir>-backup-<unix timestamp>" and returns the new path.
//
// Before the move, sibling directories whose name starts with
// "<base of dir>-backup" are ordered newest first by modification time and
// every one past the first maxBackupRetention is removed. The backup created
// by this call is not part of that count.
//
// If dir cannot be stat'ed (for example because it does not exist) there is
// nothing to back up: an empty path and a nil error are returned.
func backupDirWithRetention(dir string, maxBackupRetention int) (string, error) {
	if _, err := os.Stat(dir); err != nil {
		// Best effort by design: a missing source directory is not an error.
		return "", nil
	}

	parent := filepath.Dir(dir)
	backupPrefix := filepath.Base(dir) + "-backup"

	entries, err := os.ReadDir(parent)
	if err != nil {
		return "", err
	}

	// Only earlier backup directories take part in retention, so filter by
	// name and type before asking for file info. Unrelated siblings can then
	// never make the backup fail.
	backups := make([]fs.FileInfo, 0, len(entries))
	for _, entry := range entries {
		if !entry.IsDir() || !strings.HasPrefix(entry.Name(), backupPrefix) {
			continue
		}
		info, err := entry.Info()
		if err != nil {
			// The entry disappeared after ReadDir listed it; it no longer
			// needs pruning.
			if os.IsNotExist(err) {
				continue
			}
			return "", err
		}
		backups = append(backups, info)
	}

	// Newest first, so everything past the retention limit is the oldest.
	sort.Slice(backups, func(i, j int) bool {
		return backups[i].ModTime().After(backups[j].ModTime())
	})
	for i, backup := range backups {
		if i < maxBackupRetention {
			continue
		}
		if err := os.RemoveAll(filepath.Join(parent, backup.Name())); err != nil {
			return "", err
		}
	}

	// Move the directory to its timestamped backup location.
	backupDir := dir + "-backup-" + strconv.Itoa(int(time.Now().Unix()))
	if err := os.Rename(dir, backupDir); err != nil {
		return "", err
	}
	return backupDir, nil
}
2021-02-12 15:35:57 +00:00
2022-02-16 22:19:58 +00:00
// GetAPIServerURLsFromETCD will try to fetch the version.Program/apiaddresses key from etcd
2022-02-24 22:35:08 +00:00
// and unmarshal it to a list of apiserver endpoints.
2022-02-16 22:19:58 +00:00
func GetAPIServerURLsFromETCD ( ctx context . Context , cfg * config . Control ) ( [ ] string , error ) {
2024-10-24 00:16:34 +00:00
cl , conn , err := getClient ( ctx , cfg )
2021-02-12 15:35:57 +00:00
if err != nil {
2022-02-16 22:19:58 +00:00
return nil , err
2021-02-12 15:35:57 +00:00
}
2024-10-24 00:16:34 +00:00
defer conn . Close ( )
2022-03-10 22:03:02 +00:00
2021-02-12 15:35:57 +00:00
etcdResp , err := cl . KV . Get ( ctx , AddressKey )
if err != nil {
2022-02-16 22:19:58 +00:00
return nil , err
2021-02-12 15:35:57 +00:00
}
2022-02-16 22:19:58 +00:00
if etcdResp . Count == 0 || len ( etcdResp . Kvs [ 0 ] . Value ) == 0 {
return nil , ErrAddressNotSet
2021-02-12 15:35:57 +00:00
}
2022-02-16 22:19:58 +00:00
2021-02-12 15:35:57 +00:00
var addresses [ ] string
if err := json . Unmarshal ( etcdResp . Kvs [ 0 ] . Value , & addresses ) ; err != nil {
2022-02-16 22:19:58 +00:00
return nil , fmt . Errorf ( "failed to unmarshal apiserver addresses from etcd: %v" , err )
2021-02-12 15:35:57 +00:00
}
2022-02-16 22:19:58 +00:00
return addresses , nil
2021-02-12 15:35:57 +00:00
}
// GetMembersClientURLs will list through the member lists in etcd and return
// back a combined list of client urls for each member in the cluster
func ( e * ETCD ) GetMembersClientURLs ( ctx context . Context ) ( [ ] string , error ) {
2024-04-08 18:04:27 +00:00
members , err := e . client . MemberList ( ctx )
if err != nil {
return nil , err
}
var clientURLs [ ] string
for _ , member := range members . Members {
if ! member . IsLearner {
clientURLs = append ( clientURLs , member . ClientURLs ... )
}
}
return clientURLs , nil
2021-02-12 15:35:57 +00:00
}
2021-03-01 21:50:50 +00:00
2023-10-13 22:47:12 +00:00
// RemoveSelf will remove the member if it exists in the cluster. This should
// only be called on a node that may have previously run etcd, but will not
// currently run etcd, to ensure that it is not a member of the cluster.
// This is also called by tests to do cleanup between runs.
2021-03-01 21:50:50 +00:00
func ( e * ETCD ) RemoveSelf ( ctx context . Context ) error {
2023-10-13 22:47:12 +00:00
if e . client == nil {
if err := e . startClient ( ctx ) ; err != nil {
return err
}
}
2021-09-14 15:20:38 +00:00
if err := e . RemovePeer ( ctx , e . name , e . address , true ) ; err != nil {
2021-03-16 16:14:43 +00:00
return err
}
// backup the data dir to avoid issues when re-enabling etcd
2023-09-21 18:48:21 +00:00
oldDataDir := dbDir ( e . config ) + "-old-" + strconv . Itoa ( int ( time . Now ( ) . Unix ( ) ) )
2021-03-16 16:14:43 +00:00
// move the data directory to a temp path
2023-09-21 18:48:21 +00:00
return os . Rename ( dbDir ( e . config ) , oldDataDir )
2021-03-01 21:50:50 +00:00
}