2020-05-05 22:02:16 +00:00
package etcd
import (
2020-10-27 18:06:26 +00:00
"bytes"
2020-05-05 22:02:16 +00:00
"context"
"crypto/tls"
"encoding/json"
"fmt"
2022-10-08 00:36:57 +00:00
"io/fs"
2022-04-15 00:31:49 +00:00
"net"
2020-05-05 22:02:16 +00:00
"net/http"
"net/url"
"os"
"path/filepath"
2022-06-14 22:12:28 +00:00
"regexp"
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
"sort"
"strconv"
2020-05-05 22:02:16 +00:00
"strings"
"time"
"github.com/google/uuid"
"github.com/gorilla/mux"
2022-03-02 23:47:27 +00:00
"github.com/k3s-io/k3s/pkg/clientaccess"
2023-09-21 18:53:50 +00:00
"github.com/k3s-io/k3s/pkg/cluster/managed"
2022-03-02 23:47:27 +00:00
"github.com/k3s-io/k3s/pkg/daemons/config"
"github.com/k3s-io/k3s/pkg/daemons/control/deps"
"github.com/k3s-io/k3s/pkg/daemons/executor"
2023-11-13 14:39:24 +00:00
"github.com/k3s-io/k3s/pkg/util"
2022-03-02 23:47:27 +00:00
"github.com/k3s-io/k3s/pkg/version"
2021-04-26 16:47:53 +00:00
"github.com/k3s-io/kine/pkg/client"
endpoint2 "github.com/k3s-io/kine/pkg/endpoint"
2022-02-24 22:35:08 +00:00
cp "github.com/otiai10/copy"
2020-05-05 22:02:16 +00:00
"github.com/pkg/errors"
certutil "github.com/rancher/dynamiclistener/cert"
2021-09-14 15:20:38 +00:00
controllerv1 "github.com/rancher/wrangler/pkg/generated/controllers/core/v1"
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
"github.com/robfig/cron/v3"
2020-05-05 22:02:16 +00:00
"github.com/sirupsen/logrus"
2021-07-02 19:55:47 +00:00
"go.etcd.io/etcd/api/v3/etcdserverpb"
"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
2022-04-13 23:22:07 +00:00
"go.etcd.io/etcd/client/pkg/v3/logutil"
2021-07-02 19:55:47 +00:00
clientv3 "go.etcd.io/etcd/client/v3"
"go.etcd.io/etcd/etcdutl/v3/snapshot"
2022-04-13 23:22:07 +00:00
"go.uber.org/zap"
2022-07-09 01:27:05 +00:00
"golang.org/x/sync/semaphore"
2023-11-13 14:39:24 +00:00
v1 "k8s.io/api/core/v1"
2020-10-27 18:06:26 +00:00
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2023-11-13 14:39:24 +00:00
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
2020-05-05 22:02:16 +00:00
utilnet "k8s.io/apimachinery/pkg/util/net"
2023-01-10 18:51:39 +00:00
"k8s.io/apimachinery/pkg/util/wait"
2023-12-21 00:23:27 +00:00
"k8s.io/client-go/kubernetes"
2023-11-13 14:39:24 +00:00
nodeHelper "k8s.io/component-helpers/node/util"
nodeUtil "k8s.io/kubernetes/pkg/controller/util/node"
2021-02-12 15:35:57 +00:00
)
2020-10-27 18:06:26 +00:00
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
const (
2022-07-20 00:21:23 +00:00
testTimeout = time . Second * 30
2021-08-05 20:32:01 +00:00
manageTickerTime = time . Second * 15
learnerMaxStallTime = time . Minute * 5
memberRemovalTimeout = time . Minute * 1
2020-10-27 18:06:26 +00:00
2023-01-10 18:51:39 +00:00
// snapshotJitterMax defines the maximum time skew on cron-triggered snapshots. The actual jitter
// will be a random Duration somewhere between 0 and snapshotJitterMax.
snapshotJitterMax = time . Second * 5
2020-10-27 18:06:26 +00:00
// defaultDialTimeout is intentionally short so that connections timeout within the testTimeout defined above
defaultDialTimeout = 2 * time . Second
// other defaults from k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go
defaultKeepAliveTime = 30 * time . Second
defaultKeepAliveTimeout = 10 * time . Second
2024-01-18 05:00:18 +00:00
heartbeatInterval = 5 * time . Minute
2020-12-07 20:30:44 +00:00
2023-09-28 00:28:03 +00:00
maxBackupRetention = 5
2023-12-21 00:23:27 +00:00
etcdStatusType = v1 . NodeConditionType ( "EtcdIsVoter" )
StatusUnjoined MemberStatus = "unjoined"
StatusUnhealthy MemberStatus = "unhealthy"
StatusLearner MemberStatus = "learner"
StatusVoter MemberStatus = "voter"
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
)
2021-05-01 01:26:39 +00:00
var (
learnerProgressKey = version . Program + "/etcd/learnerProgress"
// AddressKey will contain the value of api addresses list
AddressKey = version . Program + "/apiaddresses"
2021-09-14 15:20:38 +00:00
NodeNameAnnotation = "etcd." + version . Program + ".cattle.io/node-name"
NodeAddressAnnotation = "etcd." + version . Program + ".cattle.io/node-address"
2022-02-16 22:19:58 +00:00
ErrAddressNotSet = errors . New ( "apiserver addresses not yet set" )
2022-03-01 23:55:35 +00:00
ErrNotMember = errNotMember ( )
2022-06-14 22:12:28 +00:00
invalidKeyChars = regexp . MustCompile ( ` [^-._a-zA-Z0-9] ` )
2021-05-01 01:26:39 +00:00
)
2021-09-14 15:20:38 +00:00
// NodeControllerGetter is a function that takes no arguments and returns a
// core/v1 NodeController.
// NOTE(review): presumably lets callers defer the lookup until the controller exists — confirm at use sites.
type NodeControllerGetter func ( ) controllerv1 . NodeController
2023-09-21 18:53:50 +00:00
// Compile-time assertion that *ETCD implements managed.Driver.
var _ managed.Driver = (*ETCD)(nil)
2023-12-21 00:23:27 +00:00
// MemberStatus is a string-backed status value. The constants StatusUnjoined,
// StatusUnhealthy, StatusLearner and StatusVoter in this file are its defined values.
type MemberStatus string
2021-05-01 01:26:39 +00:00
type ETCD struct {
2022-07-09 01:27:05 +00:00
client * clientv3 . Client
config * config . Control
name string
address string
cron * cron . Cron
s3 * S3
cancel context . CancelFunc
snapshotSem * semaphore . Weighted
2021-05-01 01:26:39 +00:00
}
// learnerProgress is a JSON-serializable record with an ID, a name, a raft
// applied index and a timestamp. Every field is omitted from the encoding when empty.
// NOTE(review): presumably tracks how far an etcd learner has caught up — confirm at use sites.
type learnerProgress struct {
// ID is a numeric identifier. NOTE(review): presumably the etcd member ID — confirm.
ID uint64 ` json:"id,omitempty" `
// Name is a string label. NOTE(review): presumably the member name — confirm.
Name string ` json:"name,omitempty" `
// RaftAppliedIndex is encoded as "raftAppliedIndex".
RaftAppliedIndex uint64 ` json:"raftAppliedIndex,omitempty" `
// LastProgress is a timestamp. NOTE(review): presumably when the applied index last advanced — confirm.
LastProgress metav1 . Time ` json:"lastProgress,omitempty" `
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
// Members contains a slice that holds all
// members of the cluster.
2020-05-05 22:02:16 +00:00
// Members wraps the cluster member list; the slice is encoded in JSON under
// the "members" key.
type Members struct {
// Members holds pointers to etcdserverpb.Member values.
Members [ ] * etcdserverpb . Member ` json:"members" `
}
2022-03-01 23:55:35 +00:00
// MembershipError reports that this server is not part of the etcd cluster.
type MembershipError struct {
	// Self is printed as the expected member in the error message.
	// NOTE(review): presumably this server's member name — confirm at construction sites.
	Self string
	// Members is printed as the list that was actually found.
	Members []string
}

// Error implements the error interface. The message lists the members that
// were found, followed by the expected value of Self.
func (e *MembershipError) Error() string {
	return fmt.Sprintf("this server is not a member of the etcd cluster. Found %v, expect: %s", e.Members, e.Self)
}
// Is reports whether target is the ErrNotMember sentinel. The receiver's
// fields are not consulted, so every *MembershipError compares as equivalent
// to ErrNotMember under errors.Is-style matching.
func (e *MembershipError) Is(target error) bool {
	return target == ErrNotMember
}
// errNotMember returns a pointer to a zero-valued MembershipError as an error.
func errNotMember() error {
	return new(MembershipError)
}
2021-05-01 01:26:39 +00:00
// NewETCD creates a new value of type
// ETCD with an initialized cron value.
func NewETCD ( ) * ETCD {
return & ETCD {
2023-01-10 18:51:39 +00:00
cron : cron . New ( cron . WithLogger ( cronLogger ) ) ,
2021-05-01 01:26:39 +00:00
}
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
// EndpointName returns the name of the endpoint.
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) EndpointName ( ) string {
return "etcd"
}
2023-09-21 18:53:50 +00:00
// SetControlConfig passes the cluster config into the etcd datastore. This is necessary
// because the config may not yet be fully built at the time the Driver instance is registered.
func ( e * ETCD ) SetControlConfig ( config * config . Control ) error {
if e . config != nil {
return errors . New ( "control config already set" )
2022-02-24 22:35:08 +00:00
}
2023-09-21 18:53:50 +00:00
e . config = config
2022-03-10 22:03:02 +00:00
2023-09-21 18:53:50 +00:00
address , err := getAdvertiseAddress ( e . config . PrivateIP )
2022-02-24 22:35:08 +00:00
if err != nil {
return err
}
e . address = address
return e . setName ( false )
2021-05-07 23:10:04 +00:00
}
2022-03-25 18:52:40 +00:00
// Test ensures that the local node is a voting member of the target cluster,
// and that the datastore is defragmented and not in maintenance mode due to alarms.
2020-10-27 18:06:26 +00:00
// If it is still a learner or not a part of the cluster, an error is raised.
2022-03-25 18:52:40 +00:00
// If it cannot be defragmented or has any alarms that cannot be disarmed, an error is raised.
2020-10-27 18:06:26 +00:00
func ( e * ETCD ) Test ( ctx context . Context ) error {
2023-09-21 18:53:50 +00:00
if e . config == nil {
return errors . New ( "control config not set" )
}
if e . client == nil {
return errors . New ( "etcd datastore is not started" )
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
ctx , cancel := context . WithTimeout ( ctx , testTimeout )
2020-05-05 22:02:16 +00:00
defer cancel ( )
2020-10-27 18:06:26 +00:00
2022-04-12 16:59:47 +00:00
endpoints := getEndpoints ( e . config )
2022-02-24 22:35:08 +00:00
status , err := e . client . Status ( ctx , endpoints [ 0 ] )
2020-07-29 20:52:49 +00:00
if err != nil {
return err
}
if status . IsLearner {
2020-10-27 18:06:26 +00:00
return errors . New ( "this server has not yet been promoted from learner to voting member" )
2020-07-29 20:52:49 +00:00
}
2020-10-27 18:06:26 +00:00
2022-03-25 18:52:40 +00:00
if err := e . defragment ( ctx ) ; err != nil {
return errors . Wrap ( err , "failed to defragment etcd database" )
}
2022-02-23 21:52:46 +00:00
if err := e . clearAlarms ( ctx ) ; err != nil {
return errors . Wrap ( err , "failed to report and disarm etcd alarms" )
}
2022-03-25 18:52:40 +00:00
// refresh status to see if any errors remain after clearing alarms
status , err = e . client . Status ( ctx , endpoints [ 0 ] )
if err != nil {
return err
}
if len ( status . Errors ) > 0 {
return fmt . Errorf ( "etcd cluster errors: %s" , strings . Join ( status . Errors , ", " ) )
}
2020-05-05 22:02:16 +00:00
members , err := e . client . MemberList ( ctx )
if err != nil {
return err
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
var memberNameUrls [ ] string
2020-05-05 22:02:16 +00:00
for _ , member := range members . Members {
for _ , peerURL := range member . PeerURLs {
if peerURL == e . peerURL ( ) && e . name == member . Name {
return nil
}
}
if len ( member . PeerURLs ) > 0 {
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
memberNameUrls = append ( memberNameUrls , member . Name + "=" + member . PeerURLs [ 0 ] )
2020-05-05 22:02:16 +00:00
}
}
2022-07-29 16:40:53 +00:00
return & MembershipError { Members : memberNameUrls , Self : e . name + "=" + e . peerURL ( ) }
2020-05-05 22:02:16 +00:00
}
2023-09-21 18:48:21 +00:00
// dbDir returns the path to dataDir/db/etcd
func dbDir ( config * config . Control ) string {
2020-09-24 05:59:58 +00:00
return filepath . Join ( config . DataDir , "db" , "etcd" )
2020-05-05 22:02:16 +00:00
}
2023-09-21 18:53:50 +00:00
// walDir returns the path to etcdDBDir/member/wal
2020-09-24 05:59:58 +00:00
func walDir ( config * config . Control ) string {
2023-09-21 18:48:21 +00:00
return filepath . Join ( dbDir ( config ) , "member" , "wal" )
2020-05-05 22:02:16 +00:00
}
2021-04-26 16:47:53 +00:00
// sqliteFile returns the path to dataDir/db/state.db.
func sqliteFile(config *config.Control) string {
	dataDir := config.DataDir
	return filepath.Join(dataDir, "db", "state.db")
}
2023-09-21 18:53:50 +00:00
// nameFile returns the path to etcdDBDir/name.
2020-05-05 22:02:16 +00:00
func nameFile ( config * config . Control ) string {
2023-09-21 18:48:21 +00:00
return filepath . Join ( dbDir ( config ) , "name" )
2020-05-05 22:02:16 +00:00
}
2023-09-21 18:53:50 +00:00
// clearReset deletes the reset file. A reset file that is already absent is
// not treated as a failure; any other removal error is returned to the caller.
func (e *ETCD) clearReset() error {
	err := os.Remove(e.ResetFile())
	if err == nil || os.IsNotExist(err) {
		return nil
	}
	return err
}
// IsReset reports whether the reset file is present, indicating that a cluster-reset
// has been completed successfully. It returns an error if the control config has not
// been set, or if the reset file cannot be inspected for any reason other than it
// not existing.
func (e *ETCD) IsReset() (bool, error) {
	if e.config == nil {
		return false, errors.New("control config not set")
	}

	_, statErr := os.Stat(e.ResetFile())
	switch {
	case statErr == nil:
		return true, nil
	case os.IsNotExist(statErr):
		// A missing file simply means no reset has been recorded.
		return false, nil
	default:
		return false, statErr
	}
}
// ResetFile returns the path to etcdDBDir/reset-flag.
func ( e * ETCD ) ResetFile ( ) string {
if e . config == nil {
panic ( "control config not set" )
}
return filepath . Join ( e . config . DataDir , "db" , "reset-flag" )
2020-09-30 00:53:31 +00:00
}
2020-09-22 03:23:18 +00:00
// IsInitialized checks to see if a WAL directory exists. If so, we assume that etcd
// has already been brought up at least once.
2023-09-21 18:53:50 +00:00
func ( e * ETCD ) IsInitialized ( ) ( bool , error ) {
if e . config == nil {
return false , errors . New ( "control config not set" )
}
dir := walDir ( e . config )
2020-09-22 03:23:18 +00:00
if s , err := os . Stat ( dir ) ; err == nil && s . IsDir ( ) {
2020-05-05 22:02:16 +00:00
return true , nil
} else if os . IsNotExist ( err ) {
return false , nil
} else {
2021-10-07 19:47:00 +00:00
return false , errors . Wrap ( err , "invalid state for wal directory " + dir )
2020-05-05 22:02:16 +00:00
}
}
2021-11-10 12:33:42 +00:00
// Reset resets an etcd node to a single node cluster.
2021-03-11 20:07:40 +00:00
func ( e * ETCD ) Reset ( ctx context . Context , rebootstrap func ( ) error ) error {
2020-09-22 03:23:18 +00:00
// Wait for etcd to come up as a new single-node cluster, then exit
2020-05-05 22:02:16 +00:00
go func ( ) {
2022-02-24 19:01:14 +00:00
<- e . config . Runtime . AgentReady
2020-07-29 20:52:49 +00:00
t := time . NewTicker ( 5 * time . Second )
defer t . Stop ( )
for range t . C {
2020-10-27 18:06:26 +00:00
if err := e . Test ( ctx ) ; err == nil {
2023-09-21 18:53:50 +00:00
// reset the apiaddresses to nil since we are doing a restoration
if _ , err := e . client . Put ( ctx , AddressKey , "" ) ; err != nil {
logrus . Warnf ( "failed to reset api addresses key in etcd: %v" , err )
continue
}
2020-05-05 22:02:16 +00:00
members , err := e . client . MemberList ( ctx )
if err != nil {
continue
}
2021-12-09 21:54:27 +00:00
if rebootstrap != nil {
// storageBootstrap() - runtime structure has been written with correct certificate data
if err := rebootstrap ( ) ; err != nil {
logrus . Fatal ( err )
}
2021-03-03 18:14:12 +00:00
}
// call functions to rewrite them from daemons/control/server.go (prepare())
2022-02-24 19:01:14 +00:00
if err := deps . GenServerDeps ( e . config ) ; err != nil {
2021-03-03 18:14:12 +00:00
logrus . Fatal ( err )
}
2020-05-05 22:02:16 +00:00
if len ( members . Members ) == 1 && members . Members [ 0 ] . Name == e . name {
2022-04-27 20:44:15 +00:00
// Cancel the etcd server context and allow it time to shutdown cleanly.
// Ideally we would use a waitgroup and properly sequence shutdown of the various components.
e . cancel ( )
time . Sleep ( time . Second * 5 )
logrus . Infof ( "Managed etcd cluster membership has been reset, restart without --cluster-reset flag now. Backup and delete ${datadir}/server/db on each peer etcd server and rejoin the nodes" )
2020-05-05 22:02:16 +00:00
os . Exit ( 0 )
}
2021-10-22 22:25:29 +00:00
} else {
// make sure that peer ips are updated to the node ip in case the test fails
members , err := e . client . MemberList ( ctx )
if err != nil {
logrus . Warnf ( "failed to list etcd members: %v" , err )
continue
}
if len ( members . Members ) > 1 {
logrus . Warnf ( "failed to update peer url: etcd still has more than one member" )
continue
}
if _ , err := e . client . MemberUpdate ( ctx , members . Members [ 0 ] . ID , [ ] string { e . peerURL ( ) } ) ; err != nil {
logrus . Warnf ( "failed to update peer url: %v" , err )
continue
}
2020-05-05 22:02:16 +00:00
}
2021-10-22 22:25:29 +00:00
2020-05-05 22:02:16 +00:00
}
} ( )
2020-09-22 03:23:18 +00:00
2023-09-21 18:53:50 +00:00
if err := e . startClient ( ctx ) ; err != nil {
return err
}
2020-09-22 03:23:18 +00:00
// If asked to restore from a snapshot, do so
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
if e . config . ClusterResetRestorePath != "" {
2021-03-03 18:14:12 +00:00
if e . config . EtcdS3 {
2023-10-27 18:38:00 +00:00
logrus . Infof ( "Retrieving etcd snapshot %s from S3" , e . config . ClusterResetRestorePath )
2021-05-07 23:10:04 +00:00
if err := e . initS3IfNil ( ctx ) ; err != nil {
return err
2021-03-03 18:14:12 +00:00
}
2021-06-30 20:29:03 +00:00
if err := e . s3 . Download ( ctx ) ; err != nil {
2021-03-03 18:14:12 +00:00
return err
}
logrus . Infof ( "S3 download complete for %s" , e . config . ClusterResetRestorePath )
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
info , err := os . Stat ( e . config . ClusterResetRestorePath )
if os . IsNotExist ( err ) {
return fmt . Errorf ( "etcd: snapshot path does not exist: %s" , e . config . ClusterResetRestorePath )
}
if info . IsDir ( ) {
2020-09-22 03:23:18 +00:00
return fmt . Errorf ( "etcd: snapshot path must be a file, not a directory: %s" , e . config . ClusterResetRestorePath )
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
}
2020-09-19 01:09:36 +00:00
if err := e . Restore ( ctx ) ; err != nil {
return err
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
}
2020-09-22 03:23:18 +00:00
2020-09-19 01:09:36 +00:00
if err := e . setName ( true ) ; err != nil {
return err
}
2020-09-30 00:53:31 +00:00
// touch a file to avoid multiple resets
2023-09-21 18:53:50 +00:00
if err := os . WriteFile ( e . ResetFile ( ) , [ ] byte { } , 0600 ) ; err != nil {
2020-09-30 00:53:31 +00:00
return err
}
2020-05-05 22:02:16 +00:00
return e . newCluster ( ctx , true )
}
2020-09-22 03:23:18 +00:00
// Start starts the datastore
2020-07-29 20:52:49 +00:00
func ( e * ETCD ) Start ( ctx context . Context , clientAccessInfo * clientaccess . Info ) error {
2023-09-21 18:53:50 +00:00
isInitialized , err := e . IsInitialized ( )
2020-05-05 22:02:16 +00:00
if err != nil {
2023-09-21 18:53:50 +00:00
return errors . Wrapf ( err , "failed to check for initialized etcd datastore" )
}
if err := e . startClient ( ctx ) ; err != nil {
return err
2020-05-05 22:02:16 +00:00
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
if ! e . config . EtcdDisableSnapshots {
e . setSnapshotFunction ( ctx )
e . cron . Start ( )
}
2020-10-27 18:06:26 +00:00
go e . manageLearners ( ctx )
2022-02-16 22:19:58 +00:00
if isInitialized {
2021-01-06 17:05:49 +00:00
//check etcd dir permission
2023-09-21 18:48:21 +00:00
etcdDir := dbDir ( e . config )
2021-01-06 17:05:49 +00:00
info , err := os . Stat ( etcdDir )
if err != nil {
return err
}
if info . Mode ( ) != 0700 {
if err := os . Chmod ( etcdDir , 0700 ) ; err != nil {
return err
}
}
2020-05-05 22:02:16 +00:00
opt , err := executor . CurrentETCDOptions ( )
if err != nil {
return err
}
2022-02-24 22:35:08 +00:00
logrus . Infof ( "Starting etcd for existing cluster member" )
2020-05-05 22:02:16 +00:00
return e . cluster ( ctx , false , opt )
}
2020-07-29 20:52:49 +00:00
if clientAccessInfo == nil {
2020-05-05 22:02:16 +00:00
return e . newCluster ( ctx , false )
}
2021-05-01 01:26:39 +00:00
2021-10-12 06:13:10 +00:00
go func ( ) {
2022-02-16 22:19:58 +00:00
for {
select {
case <- time . After ( 30 * time . Second ) :
2023-04-25 22:35:22 +00:00
logrus . Infof ( "Waiting for agent to become ready before joining etcd cluster" )
2022-02-16 22:19:58 +00:00
case <- e . config . Runtime . AgentReady :
2023-04-25 22:35:22 +00:00
if err := wait . PollImmediateUntilWithContext ( ctx , time . Second , func ( ctx context . Context ) ( bool , error ) {
if err := e . join ( ctx , clientAccessInfo ) ; err != nil {
// Retry the join if waiting for another member to be promoted, or waiting for peers to connect after promotion
2023-04-27 21:13:39 +00:00
if errors . Is ( err , rpctypes . ErrTooManyLearners ) || errors . Is ( err , rpctypes . ErrUnhealthy ) {
logrus . Infof ( "Waiting for other members to finish joining etcd cluster: %v" , err )
2023-04-25 22:35:22 +00:00
return false , nil
}
return false , err
}
return true , nil
} ) ; err != nil {
logrus . Fatalf ( "etcd cluster join failed: %v" , err )
2022-02-16 22:19:58 +00:00
}
return
case <- ctx . Done ( ) :
return
}
2021-10-12 06:13:10 +00:00
}
} ( )
return nil
2020-05-05 22:02:16 +00:00
}
2023-09-21 18:53:50 +00:00
// startClient points the config's datastore settings (endpoint and TLS file
// paths) at etcd and opens an etcd client for those endpoints. It returns an
// error if a client has already been started. When ctx is cancelled the client
// is cleared from the receiver and closed.
func (e *ETCD) startClient(ctx context.Context) error {
	if e.client != nil {
		return errors.New("etcd datastore already started")
	}

	etcdEndpoints := getEndpoints(e.config)

	// The datastore config uses the first endpoint and the etcd client certs.
	e.config.Datastore.Endpoint = etcdEndpoints[0]
	e.config.Datastore.BackendTLSConfig.CAFile = e.config.Runtime.ETCDServerCA
	e.config.Datastore.BackendTLSConfig.CertFile = e.config.Runtime.ClientETCDCert
	e.config.Datastore.BackendTLSConfig.KeyFile = e.config.Runtime.ClientETCDKey

	etcdClient, err := getClient(ctx, e.config, etcdEndpoints...)
	if err != nil {
		return err
	}
	e.client = etcdClient

	// Tear the client down once the context is done.
	go func() {
		<-ctx.Done()
		current := e.client
		e.client = nil
		current.Close()
	}()

	return nil
}
2020-09-22 03:23:18 +00:00
// join attempts to add a member to an existing cluster
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) join ( ctx context . Context , clientAccessInfo * clientaccess . Info ) error {
2021-09-15 05:13:31 +00:00
clientCtx , cancel := context . WithTimeout ( ctx , 20 * time . Second )
defer cancel ( )
var (
cluster [ ] string
add = true
)
clientURLs , memberList , err := ClientURLs ( clientCtx , clientAccessInfo , e . config . PrivateIP )
2020-05-05 22:02:16 +00:00
if err != nil {
return err
}
2023-09-21 18:48:21 +00:00
client , err := getClient ( clientCtx , e . config , clientURLs ... )
2020-05-05 22:02:16 +00:00
if err != nil {
return err
}
2020-10-27 18:06:26 +00:00
defer client . Close ( )
2020-05-05 22:02:16 +00:00
2023-04-27 21:13:39 +00:00
for _ , member := range memberList . Members {
2020-05-05 22:02:16 +00:00
for _ , peer := range member . PeerURLs {
u , err := url . Parse ( peer )
if err != nil {
return err
}
2021-12-18 06:26:04 +00:00
// An uninitialized joining member won't have a name; if it has our
// address it must be us.
2020-05-05 22:02:16 +00:00
if member . Name == "" && u . Hostname ( ) == e . address {
member . Name = e . name
}
2021-12-18 06:26:04 +00:00
// If we're already in the cluster, don't try to add ourselves.
if member . Name == e . name && u . Hostname ( ) == e . address {
add = false
}
2020-05-05 22:02:16 +00:00
if len ( member . PeerURLs ) > 0 {
cluster = append ( cluster , fmt . Sprintf ( "%s=%s" , member . Name , member . PeerURLs [ 0 ] ) )
}
}
2021-12-18 06:26:04 +00:00
// Try to get the node name from the member name
memberNodeName := member . Name
if lastHyphen := strings . LastIndex ( member . Name , "-" ) ; lastHyphen > 1 {
memberNodeName = member . Name [ : lastHyphen ]
}
// Make sure there's not already a member in the cluster with a duplicate node name
if member . Name != e . name && memberNodeName == e . config . ServerNodeName {
// make sure to remove the name file if a duplicate node name is used, so that we
// generate a new member name when our node name is fixed.
nameFile := nameFile ( e . config )
if err := os . Remove ( nameFile ) ; err != nil {
logrus . Errorf ( "Failed to remove etcd name file %s: %v" , nameFile , err )
}
return errors . New ( "duplicate node name found, please use a unique name for this node" )
}
2020-05-05 22:02:16 +00:00
}
if add {
2021-10-12 06:13:10 +00:00
logrus . Infof ( "Adding member %s=%s to etcd cluster %v" , e . name , e . peerURL ( ) , cluster )
2021-09-15 05:13:31 +00:00
if _ , err = client . MemberAddAsLearner ( clientCtx , [ ] string { e . peerURL ( ) } ) ; err != nil {
2020-05-05 22:02:16 +00:00
return err
}
cluster = append ( cluster , fmt . Sprintf ( "%s=%s" , e . name , e . peerURL ( ) ) )
}
2022-02-24 22:35:08 +00:00
logrus . Infof ( "Starting etcd to join cluster with members %v" , cluster )
2020-05-05 22:02:16 +00:00
return e . cluster ( ctx , false , executor . InitialOptions {
Cluster : strings . Join ( cluster , "," ) ,
State : "existing" ,
} )
}
2023-09-21 18:53:50 +00:00
// Register adds db info routes for the http request handler, and registers cluster controller callbacks
func ( e * ETCD ) Register ( handler http . Handler ) ( http . Handler , error ) {
2023-02-08 00:37:10 +00:00
e . config . Runtime . ClusterControllerStarts [ "etcd-node-metadata" ] = func ( ctx context . Context ) {
registerMetadataHandlers ( ctx , e )
2021-09-14 15:20:38 +00:00
}
2023-02-08 00:37:10 +00:00
// The apiserver endpoint controller needs to run on a node with a local apiserver,
2023-02-13 20:00:52 +00:00
// in order to successfully seed etcd with the endpoint list. The member removal controller
// also needs to run on a non-etcd node as to avoid disruption if running on the node that
// is being removed from the cluster.
2023-02-08 00:37:10 +00:00
if ! e . config . DisableAPIServer {
2023-02-13 20:00:52 +00:00
e . config . Runtime . LeaderElectedClusterControllerStarts [ version . Program + "-etcd" ] = func ( ctx context . Context ) {
2023-02-08 00:37:10 +00:00
registerEndpointsHandlers ( ctx , e )
2023-02-13 20:00:52 +00:00
registerMemberHandlers ( ctx , e )
2023-10-03 17:13:26 +00:00
registerSnapshotHandlers ( ctx , e )
2023-02-08 00:37:10 +00:00
}
2020-05-05 22:02:16 +00:00
}
2022-04-06 00:11:24 +00:00
2023-02-13 20:00:52 +00:00
// Tombstone file checking is unnecessary if we're not running etcd.
2023-02-08 00:37:10 +00:00
if ! e . config . DisableETCD {
2023-09-21 18:48:21 +00:00
tombstoneFile := filepath . Join ( dbDir ( e . config ) , "tombstone" )
2023-02-08 00:37:10 +00:00
if _ , err := os . Stat ( tombstoneFile ) ; err == nil {
logrus . Infof ( "tombstone file has been detected, removing data dir to rejoin the cluster" )
2023-09-21 18:48:21 +00:00
if _ , err := backupDirWithRetention ( dbDir ( e . config ) , maxBackupRetention ) ; err != nil {
2023-02-08 00:37:10 +00:00
return nil , err
}
}
if err := e . setName ( false ) ; err != nil {
return nil , err
}
2020-12-07 20:30:44 +00:00
}
2021-09-14 15:20:38 +00:00
2023-02-08 00:37:10 +00:00
return e . handler ( handler ) , nil
2020-05-05 22:02:16 +00:00
}
2020-09-22 03:23:18 +00:00
// setName sets a unique name for this cluster member. The first time this is called,
// or if force is set to true, a new name will be generated and written to disk. The persistent
// name is used on subsequent calls.
2020-09-19 01:09:36 +00:00
func ( e * ETCD ) setName ( force bool ) error {
2020-05-05 22:02:16 +00:00
fileName := nameFile ( e . config )
2022-10-08 00:36:57 +00:00
data , err := os . ReadFile ( fileName )
2020-09-19 01:09:36 +00:00
if os . IsNotExist ( err ) || force {
2021-09-17 22:51:18 +00:00
e . name = e . config . ServerNodeName + "-" + uuid . New ( ) . String ( ) [ : 8 ]
2020-09-24 06:01:35 +00:00
if err := os . MkdirAll ( filepath . Dir ( fileName ) , 0700 ) ; err != nil {
2020-05-05 22:02:16 +00:00
return err
}
2022-10-08 00:36:57 +00:00
return os . WriteFile ( fileName , [ ] byte ( e . name ) , 0600 )
2020-05-05 22:02:16 +00:00
} else if err != nil {
return err
}
e . name = string ( data )
return nil
}
2020-09-24 06:29:25 +00:00
// handler wraps the handler with routes for database info
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) handler ( next http . Handler ) http . Handler {
2022-04-04 21:54:50 +00:00
mux := mux . NewRouter ( ) . SkipClean ( true )
2020-05-05 22:02:16 +00:00
mux . Handle ( "/db/info" , e . infoHandler ( ) )
mux . NotFoundHandler = next
return mux
}
2021-10-12 06:13:10 +00:00
// infoHandler returns etcd cluster information. This is used by new members when joining the cluster.
2023-04-27 21:13:39 +00:00
// If we can't retrieve an actual MemberList from etcd, we return a canned response with only the local node listed.
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) infoHandler ( ) http . Handler {
return http . HandlerFunc ( func ( rw http . ResponseWriter , req * http . Request ) {
ctx , cancel := context . WithTimeout ( req . Context ( ) , 2 * time . Second )
defer cancel ( )
members , err := e . client . MemberList ( ctx )
if err != nil {
2023-04-27 21:13:39 +00:00
logrus . Warnf ( "Failed to get etcd MemberList for %s: %v" , req . RemoteAddr , err )
members = & clientv3 . MemberListResponse {
2020-05-05 22:02:16 +00:00
Members : [ ] * etcdserverpb . Member {
{
Name : e . name ,
PeerURLs : [ ] string { e . peerURL ( ) } ,
ClientURLs : [ ] string { e . clientURL ( ) } ,
} ,
} ,
2023-04-27 21:13:39 +00:00
}
2020-05-05 22:02:16 +00:00
}
rw . Header ( ) . Set ( "Content-Type" , "application/json" )
json . NewEncoder ( rw ) . Encode ( & Members {
Members : members . Members ,
} )
} )
}
2023-09-21 18:48:21 +00:00
// getClient returns an etcd client connected to the specified endpoints.
2022-02-24 22:35:08 +00:00
// If no endpoints are provided, endpoints are retrieved from the provided runtime config.
// If the runtime config does not list any endpoints, the default endpoint is used.
// The returned client should be closed when no longer needed, in order to avoid leaking GRPC
// client goroutines.
2023-09-21 18:48:21 +00:00
func getClient ( ctx context . Context , control * config . Control , endpoints ... string ) ( * clientv3 . Client , error ) {
2022-04-12 16:59:47 +00:00
cfg , err := getClientConfig ( ctx , control , endpoints ... )
2020-05-05 22:02:16 +00:00
if err != nil {
return nil , err
}
2021-11-10 12:33:42 +00:00
2021-07-02 19:55:47 +00:00
return clientv3 . New ( * cfg )
2020-05-05 22:02:16 +00:00
}
2022-02-24 22:35:08 +00:00
// getClientConfig generates an etcd client config connected to the specified endpoints.
// If no endpoints are provided, getEndpoints is called to provide defaults.
2022-04-12 16:59:47 +00:00
func getClientConfig ( ctx context . Context , control * config . Control , endpoints ... string ) ( * clientv3 . Config , error ) {
runtime := control . Runtime
2022-02-24 22:35:08 +00:00
if len ( endpoints ) == 0 {
2022-04-12 16:59:47 +00:00
endpoints = getEndpoints ( control )
2022-02-24 22:35:08 +00:00
}
2022-03-29 18:45:21 +00:00
config := & clientv3 . Config {
2020-09-22 03:23:18 +00:00
Endpoints : endpoints ,
Context : ctx ,
DialTimeout : defaultDialTimeout ,
DialKeepAliveTime : defaultKeepAliveTime ,
2020-10-27 18:06:26 +00:00
DialKeepAliveTimeout : defaultKeepAliveTimeout ,
2023-10-18 00:13:07 +00:00
AutoSyncInterval : defaultKeepAliveTimeout ,
2023-02-13 20:00:52 +00:00
PermitWithoutStream : true ,
2022-03-29 18:45:21 +00:00
}
var err error
if strings . HasPrefix ( endpoints [ 0 ] , "https://" ) {
config . TLS , err = toTLSConfig ( runtime )
}
return config , err
2020-05-05 22:02:16 +00:00
}
2022-02-24 22:35:08 +00:00
// getEndpoints returns the endpoints from the runtime config if set, otherwise the default endpoint.
2022-04-12 16:59:47 +00:00
func getEndpoints ( control * config . Control ) [ ] string {
runtime := control . Runtime
2022-02-24 22:35:08 +00:00
if len ( runtime . EtcdConfig . Endpoints ) > 0 {
return runtime . EtcdConfig . Endpoints
}
2022-07-21 21:40:09 +00:00
return [ ] string { fmt . Sprintf ( "https://%s:2379" , control . Loopback ( true ) ) }
2022-02-24 22:35:08 +00:00
}
2020-09-22 03:23:18 +00:00
// toTLSConfig converts the ControlRuntime configuration to TLS configuration suitable
// for use by etcd.
2020-05-05 22:02:16 +00:00
func toTLSConfig ( runtime * config . ControlRuntime ) ( * tls . Config , error ) {
2021-10-12 06:13:10 +00:00
if runtime . ClientETCDCert == "" || runtime . ClientETCDKey == "" || runtime . ETCDServerCA == "" {
return nil , errors . New ( "runtime is not ready yet" )
}
2020-05-05 22:02:16 +00:00
clientCert , err := tls . LoadX509KeyPair ( runtime . ClientETCDCert , runtime . ClientETCDKey )
if err != nil {
return nil , err
}
pool , err := certutil . NewPool ( runtime . ETCDServerCA )
if err != nil {
return nil , err
}
return & tls . Config {
RootCAs : pool ,
Certificates : [ ] tls . Certificate { clientCert } ,
} , nil
}
2020-09-22 03:23:18 +00:00
// getAdvertiseAddress returns the IP address best suited for advertising to clients
2023-03-24 22:19:44 +00:00
func getAdvertiseAddress ( advertiseIP string ) ( string , error ) {
2020-05-05 22:02:16 +00:00
ip := advertiseIP
if ip == "" {
ipAddr , err := utilnet . ChooseHostInterface ( )
if err != nil {
return "" , err
}
ip = ipAddr . String ( )
}
return ip , nil
}
2020-09-22 03:23:18 +00:00
// newCluster returns options to set up etcd for a new cluster
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) newCluster ( ctx context . Context , reset bool ) error {
2022-02-24 22:35:08 +00:00
logrus . Infof ( "Starting etcd for new cluster" )
2021-04-26 16:47:53 +00:00
err := e . cluster ( ctx , reset , executor . InitialOptions {
2021-10-12 06:13:10 +00:00
AdvertisePeerURL : e . peerURL ( ) ,
Cluster : fmt . Sprintf ( "%s=%s" , e . name , e . peerURL ( ) ) ,
2020-05-05 22:02:16 +00:00
State : "new" ,
} )
2021-04-26 16:47:53 +00:00
if err != nil {
return err
}
if err := e . migrateFromSQLite ( ctx ) ; err != nil {
return fmt . Errorf ( "failed to migrate content from sqlite to etcd: %w" , err )
}
return nil
}
func ( e * ETCD ) migrateFromSQLite ( ctx context . Context ) error {
_ , err := os . Stat ( sqliteFile ( e . config ) )
if os . IsNotExist ( err ) {
return nil
} else if err != nil {
return err
}
logrus . Infof ( "Migrating content from sqlite to etcd" )
ctx , cancel := context . WithCancel ( ctx )
defer cancel ( )
_ , err = endpoint2 . Listen ( ctx , endpoint2 . Config {
Endpoint : endpoint2 . SQLiteBackend ,
} )
if err != nil {
return err
}
sqliteClient , err := client . New ( endpoint2 . ETCDConfig {
Endpoints : [ ] string { "unix://kine.sock" } ,
} )
if err != nil {
return err
}
defer sqliteClient . Close ( )
2023-09-21 18:48:21 +00:00
etcdClient , err := getClient ( ctx , e . config )
2021-04-26 16:47:53 +00:00
if err != nil {
return err
}
defer etcdClient . Close ( )
values , err := sqliteClient . List ( ctx , "/registry/" , 0 )
if err != nil {
return err
}
for _ , value := range values {
logrus . Infof ( "Migrating etcd key %s" , value . Key )
_ , err := etcdClient . Put ( ctx , string ( value . Key ) , string ( value . Data ) )
if err != nil {
return err
}
}
return os . Rename ( sqliteFile ( e . config ) , sqliteFile ( e . config ) + ".migrated" )
2020-05-05 22:02:16 +00:00
}
2022-05-05 08:10:08 +00:00
// peerURL returns the external peer access address for the local node.
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) peerURL ( ) string {
2022-04-15 00:31:49 +00:00
return fmt . Sprintf ( "https://%s" , net . JoinHostPort ( e . address , "2380" ) )
2020-05-05 22:02:16 +00:00
}
2022-05-05 08:10:08 +00:00
// listenClientURLs returns a list of URLs to bind to for peer connections.
// During cluster reset/restore, we only listen on loopback to avoid having peers
// connect mid-process.
func ( e * ETCD ) listenPeerURLs ( reset bool ) string {
2022-07-21 21:40:09 +00:00
peerURLs := fmt . Sprintf ( "https://%s:2380" , e . config . Loopback ( true ) )
2022-05-05 08:10:08 +00:00
if ! reset {
peerURLs += "," + e . peerURL ( )
}
return peerURLs
}
// clientURL returns the external client access address for the local node.
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) clientURL ( ) string {
2022-04-15 00:31:49 +00:00
return fmt . Sprintf ( "https://%s" , net . JoinHostPort ( e . address , "2379" ) )
2020-05-05 22:02:16 +00:00
}
2023-03-24 22:19:44 +00:00
// advertiseClientURLs returns the advertised addresses for the local node.
// During cluster reset/restore we only listen on loopback to avoid having apiservers
// on other nodes connect mid-process.
func (e *ETCD) advertiseClientURLs(reset bool) string {
	if reset {
		// Format the loopback host the same way listenClientURLs does. The other
		// listen* helpers embed Loopback(true) directly in a URL, so passing it
		// through net.JoinHostPort as well would bracket an IPv6 loopback twice.
		// NOTE(review): assumes Loopback(true) yields a URL-safe host (IPv6 in
		// square brackets) - confirm against the config package.
		return fmt.Sprintf("https://%s:2379", e.config.Loopback(true))
	}
	return e.clientURL()
}
2022-05-05 08:10:08 +00:00
// listenClientURLs returns a list of URLs to bind to for client connections.
2023-03-24 22:19:44 +00:00
// During cluster reset/restore, we only listen on loopback to avoid having apiservers
// on other nodes connect mid-process.
2022-05-05 08:10:08 +00:00
func ( e * ETCD ) listenClientURLs ( reset bool ) string {
2022-07-21 21:40:09 +00:00
clientURLs := fmt . Sprintf ( "https://%s:2379" , e . config . Loopback ( true ) )
2022-05-05 08:10:08 +00:00
if ! reset {
clientURLs += "," + e . clientURL ( )
}
return clientURLs
}
// listenMetricsURLs returns a list of URLs to bind to for metrics connections.
func ( e * ETCD ) listenMetricsURLs ( reset bool ) string {
2022-07-21 21:40:09 +00:00
metricsURLs := fmt . Sprintf ( "http://%s:2381" , e . config . Loopback ( true ) )
2022-05-05 08:10:08 +00:00
if ! reset && e . config . EtcdExposeMetrics {
metricsURLs += "," + fmt . Sprintf ( "http://%s" , net . JoinHostPort ( e . address , "2381" ) )
2021-01-23 01:40:48 +00:00
}
2022-05-05 08:10:08 +00:00
return metricsURLs
2021-01-23 01:40:48 +00:00
}
2023-09-22 06:54:03 +00:00
// listenClientHTTPURLs returns the URL to bind to for http client connections
// (loopback, port 2382). This should no longer be used, but it must be set in
// order to free the listen URLs for dedicated use by GRPC.
// Ref: https://github.com/etcd-io/etcd/issues/15402
func (e *ETCD) listenClientHTTPURLs() string {
	return "https://" + e.config.Loopback(true) + ":2382"
}
2022-05-05 08:10:08 +00:00
// cluster calls the executor to start etcd running with the provided configuration.
func ( e * ETCD ) cluster ( ctx context . Context , reset bool , options executor . InitialOptions ) error {
2022-04-27 20:44:15 +00:00
ctx , e . cancel = context . WithCancel ( ctx )
2021-09-08 17:56:18 +00:00
return executor . ETCD ( ctx , executor . ETCDConfig {
2020-05-05 22:02:16 +00:00
Name : e . name ,
InitialOptions : options ,
2022-05-05 08:10:08 +00:00
ForceNewCluster : reset ,
ListenClientURLs : e . listenClientURLs ( reset ) ,
ListenMetricsURLs : e . listenMetricsURLs ( reset ) ,
ListenPeerURLs : e . listenPeerURLs ( reset ) ,
2023-03-24 22:19:44 +00:00
AdvertiseClientURLs : e . advertiseClientURLs ( reset ) ,
2023-09-21 18:48:21 +00:00
DataDir : dbDir ( e . config ) ,
2020-05-05 22:02:16 +00:00
ServerTrust : executor . ServerTrust {
CertFile : e . config . Runtime . ServerETCDCert ,
KeyFile : e . config . Runtime . ServerETCDKey ,
ClientCertAuth : true ,
TrustedCAFile : e . config . Runtime . ETCDServerCA ,
} ,
PeerTrust : executor . PeerTrust {
CertFile : e . config . Runtime . PeerServerClientETCDCert ,
KeyFile : e . config . Runtime . PeerServerClientETCDKey ,
ClientCertAuth : true ,
TrustedCAFile : e . config . Runtime . ETCDPeerCA ,
} ,
2022-05-05 08:10:08 +00:00
SnapshotCount : 10000 ,
2022-04-27 20:44:15 +00:00
ElectionTimeout : 5000 ,
HeartbeatInterval : 500 ,
Logger : "zap" ,
LogOutputs : [ ] string { "stderr" } ,
ExperimentalInitialCorruptCheck : true ,
2023-09-22 06:54:03 +00:00
ListenClientHTTPURLs : e . listenClientHTTPURLs ( ) ,
2021-11-12 05:03:15 +00:00
} , e . config . ExtraEtcdArgs )
2020-05-05 22:02:16 +00:00
}
2022-02-24 22:35:08 +00:00
func ( e * ETCD ) StartEmbeddedTemporary ( ctx context . Context ) error {
2023-09-21 18:48:21 +00:00
etcdDataDir := dbDir ( e . config )
2022-02-24 22:35:08 +00:00
tmpDataDir := etcdDataDir + "-tmp"
os . RemoveAll ( tmpDataDir )
2022-03-25 18:52:40 +00:00
go func ( ) {
<- ctx . Done ( )
2022-02-24 22:35:08 +00:00
if err := os . RemoveAll ( tmpDataDir ) ; err != nil {
logrus . Warnf ( "Failed to remove etcd temp dir: %v" , err )
}
} ( )
2023-09-21 18:53:50 +00:00
if e . client != nil {
return errors . New ( "etcd datastore already started" )
}
client , err := getClient ( ctx , e . config )
if err != nil {
return err
}
e . client = client
go func ( ) {
<- ctx . Done ( )
client := e . client
e . client = nil
client . Close ( )
} ( )
2022-02-24 22:35:08 +00:00
if err := cp . Copy ( etcdDataDir , tmpDataDir , cp . Options { PreserveOwner : true } ) ; err != nil {
return err
}
2022-04-12 16:59:47 +00:00
endpoints := getEndpoints ( e . config )
2022-02-24 22:35:08 +00:00
clientURL := endpoints [ 0 ]
2023-09-22 06:54:03 +00:00
// peer URL is usually 1 more than client
2022-02-24 22:35:08 +00:00
peerURL , err := addPort ( endpoints [ 0 ] , 1 )
if err != nil {
return err
}
2023-09-22 06:54:03 +00:00
// client http URL is usually 3 more than client, after peer and metrics
clientHTTPURL , err := addPort ( endpoints [ 0 ] , 3 )
if err != nil {
return err
}
2022-02-24 22:35:08 +00:00
embedded := executor . Embedded { }
2022-04-27 20:44:15 +00:00
ctx , e . cancel = context . WithCancel ( ctx )
2022-02-24 22:35:08 +00:00
return embedded . ETCD ( ctx , executor . ETCDConfig {
2022-04-27 20:44:15 +00:00
InitialOptions : executor . InitialOptions { AdvertisePeerURL : peerURL } ,
DataDir : tmpDataDir ,
ForceNewCluster : true ,
AdvertiseClientURLs : clientURL ,
ListenClientURLs : clientURL ,
2023-09-22 06:54:03 +00:00
ListenClientHTTPURLs : clientHTTPURL ,
2022-04-27 20:44:15 +00:00
ListenPeerURLs : peerURL ,
Logger : "zap" ,
HeartbeatInterval : 500 ,
ElectionTimeout : 5000 ,
2022-05-05 08:10:08 +00:00
SnapshotCount : 10000 ,
2022-04-27 20:44:15 +00:00
Name : e . name ,
LogOutputs : [ ] string { "stderr" } ,
ExperimentalInitialCorruptCheck : true ,
2023-04-27 21:13:39 +00:00
} , append ( e . config . ExtraEtcdArgs , "--max-snapshots=0" , "--max-wals=0" ) )
2022-02-24 22:35:08 +00:00
}
// addPort parses address as a URL, adds offset to its port number, and returns the
// resulting "scheme://host:port" URL. Only scheme, host and port are kept; any path,
// query or user info in address is dropped.
// An error is returned if address cannot be parsed or has no numeric port.
func addPort(address string, offset int) (string, error) {
	u, err := url.Parse(address)
	if err != nil {
		return "", err
	}
	port, err := strconv.Atoi(u.Port())
	if err != nil {
		return "", err
	}
	// u.Hostname() strips the square brackets from IPv6 literals; JoinHostPort
	// puts them back so the result is still a valid URL for IPv6 hosts.
	hostPort := net.JoinHostPort(u.Hostname(), strconv.Itoa(port+offset))
	return u.Scheme + "://" + hostPort, nil
}
2021-09-14 15:20:38 +00:00
// RemovePeer removes a peer from the cluster. The peer name and IP address must both match.
func ( e * ETCD ) RemovePeer ( ctx context . Context , name , address string , allowSelfRemoval bool ) error {
2021-08-05 20:32:01 +00:00
ctx , cancel := context . WithTimeout ( ctx , memberRemovalTimeout )
defer cancel ( )
2020-05-05 22:02:16 +00:00
members , err := e . client . MemberList ( ctx )
if err != nil {
return err
}
for _ , member := range members . Members {
2021-09-14 15:20:38 +00:00
if member . Name != name {
2020-05-05 22:02:16 +00:00
continue
}
for _ , peerURL := range member . PeerURLs {
u , err := url . Parse ( peerURL )
if err != nil {
return err
}
if u . Hostname ( ) == address {
2021-09-14 15:20:38 +00:00
if e . address == address && ! allowSelfRemoval {
return errors . New ( "not removing self from etcd cluster" )
2020-10-28 16:32:51 +00:00
}
2020-05-05 22:02:16 +00:00
logrus . Infof ( "Removing name=%s id=%d address=%s from etcd" , member . Name , member . ID , address )
_ , err := e . client . MemberRemove ( ctx , member . ID )
2023-04-05 00:52:14 +00:00
if errors . Is ( err , rpctypes . ErrGRPCMemberNotFound ) {
2020-10-28 16:32:51 +00:00
return nil
}
2020-05-05 22:02:16 +00:00
return err
}
}
}
return nil
}
2020-07-29 20:52:49 +00:00
2020-10-27 18:06:26 +00:00
// manageLearners monitors the etcd cluster to ensure that learners are making progress towards
// being promoted to full voting member. The checks only run on the cluster member that is
// the etcd leader.
2022-02-23 21:52:46 +00:00
func ( e * ETCD ) manageLearners ( ctx context . Context ) {
2022-02-24 19:01:14 +00:00
<- e . config . Runtime . AgentReady
2020-10-27 18:06:26 +00:00
t := time . NewTicker ( manageTickerTime )
2020-07-29 20:52:49 +00:00
defer t . Stop ( )
2020-10-27 18:06:26 +00:00
2020-07-29 20:52:49 +00:00
for range t . C {
2022-07-20 00:21:23 +00:00
ctx , cancel := context . WithTimeout ( ctx , manageTickerTime )
2020-10-27 18:06:26 +00:00
defer cancel ( )
// Check to see if the local node is the leader. Only the leader should do learner management.
2021-07-26 16:59:33 +00:00
if e . client == nil {
2022-02-23 21:52:46 +00:00
logrus . Debug ( "Etcd client was nil" )
2021-07-26 16:59:33 +00:00
continue
}
2023-12-21 00:23:27 +00:00
2022-04-12 16:59:47 +00:00
endpoints := getEndpoints ( e . config )
2022-02-24 22:35:08 +00:00
if status , err := e . client . Status ( ctx , endpoints [ 0 ] ) ; err != nil {
2020-10-27 18:06:26 +00:00
logrus . Errorf ( "Failed to check local etcd status for learner management: %v" , err )
continue
} else if status . Header . MemberId != status . Leader {
continue
}
progress , err := e . getLearnerProgress ( ctx )
2020-07-29 20:52:49 +00:00
if err != nil {
2020-10-27 18:06:26 +00:00
logrus . Errorf ( "Failed to get recorded learner progress from etcd: %v" , err )
2020-07-29 20:52:49 +00:00
continue
}
2020-10-27 18:06:26 +00:00
members , err := e . client . MemberList ( ctx )
2020-07-29 20:52:49 +00:00
if err != nil {
2020-10-27 18:06:26 +00:00
logrus . Errorf ( "Failed to get etcd members for learner management: %v" , err )
2020-07-29 20:52:49 +00:00
continue
}
2020-10-27 18:06:26 +00:00
2023-11-16 23:58:42 +00:00
client , err := util . GetClientSet ( e . config . Runtime . KubeConfigSupervisor )
2023-11-13 14:39:24 +00:00
if err != nil {
2023-11-16 23:58:42 +00:00
logrus . Errorf ( "Failed to get k8s client for patch node status condition: %v" , err )
2023-11-13 14:39:24 +00:00
continue
}
2023-11-16 23:58:42 +00:00
nodes , err := e . getETCDNodes ( )
2023-11-13 14:39:24 +00:00
if err != nil {
2023-11-16 23:58:42 +00:00
logrus . Warnf ( "Failed to list nodes with etcd role: %v" , err )
2023-11-13 14:39:24 +00:00
}
2023-12-21 00:23:27 +00:00
// a map to track if a node is a member of the etcd cluster or not
nodeIsMember := make ( map [ string ] bool )
nodesMap := make ( map [ string ] * v1 . Node )
for _ , node := range nodes {
nodeIsMember [ node . Name ] = false
nodesMap [ node . Name ] = node
}
2020-07-29 20:52:49 +00:00
for _ , member := range members . Members {
2024-01-10 19:37:56 +00:00
status := StatusVoter
message := ""
2023-12-21 00:23:27 +00:00
2020-10-27 18:06:26 +00:00
if member . IsLearner {
2024-01-10 19:37:56 +00:00
status = StatusLearner
2020-10-27 18:06:26 +00:00
if err := e . trackLearnerProgress ( ctx , progress , member ) ; err != nil {
logrus . Errorf ( "Failed to track learner progress towards promotion: %v" , err )
}
2024-01-10 19:37:56 +00:00
}
2023-12-21 00:23:27 +00:00
2024-01-10 19:37:56 +00:00
var node * v1 . Node
for _ , n := range nodes {
if strings . HasPrefix ( member . Name , n . Name + "-" ) {
node = n
nodeIsMember [ n . Name ] = true
break
2023-11-13 14:39:24 +00:00
}
2024-01-10 19:37:56 +00:00
}
if node == nil {
continue
2020-07-29 20:52:49 +00:00
}
2023-11-13 14:39:24 +00:00
2024-01-10 19:37:56 +00:00
// verify if the member is healthy and set the status
2023-12-21 00:23:27 +00:00
if _ , err := e . getETCDStatus ( ctx , member . ClientURLs [ 0 ] ) ; err != nil {
2024-01-10 19:37:56 +00:00
message = err . Error ( )
status = StatusUnhealthy
2023-12-21 00:23:27 +00:00
}
2024-01-10 19:37:56 +00:00
if err := e . setEtcdStatusCondition ( node , client , member . Name , status , message ) ; err != nil {
2023-12-21 00:23:27 +00:00
logrus . Errorf ( "Unable to set etcd status condition %s: %v" , member . Name , err )
}
}
2024-01-10 19:37:56 +00:00
for nodeName , node := range nodesMap {
if ! nodeIsMember [ nodeName ] {
2023-12-21 00:23:27 +00:00
if err := e . setEtcdStatusCondition ( node , client , nodeName , StatusUnjoined , "" ) ; err != nil {
logrus . Errorf ( "Unable to set etcd status condition for a node that is not a cluster member %s: %v" , nodeName , err )
2023-11-13 14:39:24 +00:00
}
}
2020-07-29 20:52:49 +00:00
}
}
2023-11-13 14:39:24 +00:00
}
2023-11-16 23:58:42 +00:00
func ( e * ETCD ) getETCDNodes ( ) ( [ ] * v1 . Node , error ) {
if e . config . Runtime . Core == nil {
return nil , errors . New ( "runtime core not ready" )
}
2023-11-13 14:39:24 +00:00
nodes := e . config . Runtime . Core . Core ( ) . V1 ( ) . Node ( )
etcdSelector := labels . Set { util . ETCDRoleLabelKey : "true" }
2023-11-16 23:58:42 +00:00
return nodes . Cache ( ) . List ( etcdSelector . AsSelector ( ) )
2020-07-29 20:52:49 +00:00
}
2020-10-27 18:06:26 +00:00
// trackLearnerProcess attempts to promote a learner. If it cannot be promoted, progress through the raft index is tracked.
// If the learner does not make any progress in a reasonable amount of time, it is evicted from the cluster.
func ( e * ETCD ) trackLearnerProgress ( ctx context . Context , progress * learnerProgress , member * etcdserverpb . Member ) error {
// Try to promote it. If it can be promoted, no further tracking is necessary
if _ , err := e . client . MemberPromote ( ctx , member . ID ) ; err != nil {
logrus . Debugf ( "Unable to promote learner %s: %v" , member . Name , err )
} else {
logrus . Infof ( "Promoted learner %s" , member . Name )
return nil
}
now := time . Now ( )
// If this is the first time we've tracked this member's progress, reset stats
if progress . Name != member . Name || progress . ID != member . ID {
progress . ID = member . ID
progress . Name = member . Name
progress . RaftAppliedIndex = 0
progress . LastProgress . Time = now
}
// Update progress by retrieving status from the member's first reachable client URL
for _ , ep := range member . ClientURLs {
2023-12-21 00:23:27 +00:00
status , err := e . getETCDStatus ( ctx , ep )
2020-10-27 18:06:26 +00:00
if err != nil {
logrus . Debugf ( "Failed to get etcd status from learner %s at %s: %v" , member . Name , ep , err )
continue
}
if progress . RaftAppliedIndex < status . RaftAppliedIndex {
logrus . Debugf ( "Learner %s has progressed from RaftAppliedIndex %d to %d" , progress . Name , progress . RaftAppliedIndex , status . RaftAppliedIndex )
progress . RaftAppliedIndex = status . RaftAppliedIndex
progress . LastProgress . Time = now
}
break
}
// Warn if the learner hasn't made any progress
if ! progress . LastProgress . Time . Equal ( now ) {
logrus . Warnf ( "Learner %s stalled at RaftAppliedIndex=%d for %s" , progress . Name , progress . RaftAppliedIndex , now . Sub ( progress . LastProgress . Time ) . String ( ) )
}
// See if it's time to evict yet
if now . Sub ( progress . LastProgress . Time ) > learnerMaxStallTime {
if _ , err := e . client . MemberRemove ( ctx , member . ID ) ; err != nil {
return err
}
logrus . Warnf ( "Removed learner %s from etcd cluster" , member . Name )
return nil
}
return e . setLearnerProgress ( ctx , progress )
}
2023-12-21 00:23:27 +00:00
// getETCDStatus queries the status of the etcd member at url, bounded by
// defaultDialTimeout. An error is returned if the status call fails or if the
// response itself lists errors; the response is returned alongside the error
// in both cases.
func (e *ETCD) getETCDStatus(ctx context.Context, url string) (*clientv3.StatusResponse, error) {
	ctx, cancel := context.WithTimeout(ctx, defaultDialTimeout)
	defer cancel()

	resp, err := e.client.Status(ctx, url)
	switch {
	case err != nil:
		return resp, errors.Wrap(err, "failed to check etcd member status")
	case len(resp.Errors) != 0:
		return resp, errors.New("etcd member has status errors: " + strings.Join(resp.Errors, ","))
	default:
		return resp, nil
	}
}
2023-11-13 14:39:24 +00:00
2023-12-21 00:23:27 +00:00
func ( e * ETCD ) setEtcdStatusCondition ( node * v1 . Node , client kubernetes . Interface , memberName string , memberStatus MemberStatus , message string ) error {
2023-11-13 14:39:24 +00:00
var newCondition v1 . NodeCondition
2023-12-21 00:23:27 +00:00
switch memberStatus {
case StatusLearner :
newCondition = v1 . NodeCondition {
Type : etcdStatusType ,
Status : "False" ,
Reason : "MemberIsLearner" ,
Message : "Node has not been promoted to voting member of the etcd cluster" ,
}
case StatusVoter :
2023-11-13 14:39:24 +00:00
newCondition = v1 . NodeCondition {
Type : etcdStatusType ,
Status : "True" ,
Reason : "MemberNotLearner" ,
Message : "Node is a voting member of the etcd cluster" ,
}
2023-12-21 00:23:27 +00:00
case StatusUnhealthy :
2023-11-13 14:39:24 +00:00
newCondition = v1 . NodeCondition {
Type : etcdStatusType ,
Status : "False" ,
2023-12-21 00:23:27 +00:00
Reason : "Unhealthy" ,
Message : "Node is unhealthy" ,
}
case StatusUnjoined :
newCondition = v1 . NodeCondition {
Type : etcdStatusType ,
Status : "False" ,
Reason : "NotAMember" ,
Message : "Node is not a member of the etcd cluster" ,
2023-11-13 14:39:24 +00:00
}
2023-12-21 00:23:27 +00:00
default :
logrus . Warnf ( "Unknown etcd member status %s" , memberStatus )
return nil
}
if message != "" {
newCondition . Message = message
2023-11-13 14:39:24 +00:00
}
2023-12-21 00:23:27 +00:00
if find , condition := nodeUtil . GetNodeCondition ( & node . Status , etcdStatusType ) ; find >= 0 {
2024-01-18 05:00:18 +00:00
// if the condition is not changing, we only want to update the last heartbeat time
if condition . Status == newCondition . Status && condition . Reason == newCondition . Reason && condition . Message == newCondition . Message {
2023-12-21 00:23:27 +00:00
logrus . Debugf ( "Node %s is not changing etcd status condition" , memberName )
2024-01-18 05:00:18 +00:00
// If the condition status is not changing, we only want to update the last heartbeat time if the
// LastHeartbeatTime is older than the heartbeatTimeout.
if metav1 . Now ( ) . Sub ( condition . LastHeartbeatTime . Time ) < heartbeatInterval {
return nil
}
2023-11-13 14:39:24 +00:00
condition . LastHeartbeatTime = metav1 . Now ( )
2023-12-21 00:23:27 +00:00
return nodeHelper . SetNodeCondition ( client , types . NodeName ( node . Name ) , * condition )
2023-11-13 14:39:24 +00:00
}
2023-12-21 00:23:27 +00:00
logrus . Debugf ( "Node %s is changing etcd status condition" , memberName )
2023-11-13 14:39:24 +00:00
condition = & newCondition
condition . LastHeartbeatTime = metav1 . Now ( )
condition . LastTransitionTime = metav1 . Now ( )
2023-12-21 00:23:27 +00:00
return nodeHelper . SetNodeCondition ( client , types . NodeName ( node . Name ) , * condition )
2023-11-13 14:39:24 +00:00
}
2023-12-21 00:23:27 +00:00
logrus . Infof ( "Adding node %s etcd status condition" , memberName )
2023-11-13 14:39:24 +00:00
newCondition . LastHeartbeatTime = metav1 . Now ( )
newCondition . LastTransitionTime = metav1 . Now ( )
2023-12-21 00:23:27 +00:00
return nodeHelper . SetNodeCondition ( client , types . NodeName ( node . Name ) , newCondition )
2023-11-13 14:39:24 +00:00
}
2020-10-27 18:06:26 +00:00
// getLearnerProgress reads the value stored under learnerProgressKey in etcd and
// decodes it from JSON into a learnerProgress. When the key has no value yet, an
// empty learnerProgress is returned.
func (e *ETCD) getLearnerProgress(ctx context.Context) (*learnerProgress, error) {
	resp, err := e.client.Get(ctx, learnerProgressKey)
	if err != nil {
		return nil, err
	}

	progress := &learnerProgress{}
	if resp.Count < 1 {
		return progress, nil
	}

	decoder := json.NewDecoder(bytes.NewReader(resp.Kvs[0].Value))
	if err := decoder.Decode(progress); err != nil {
		return nil, err
	}
	return progress, nil
}
// setLearnerProgress JSON-encodes status and stores it in etcd under learnerProgressKey.
func (e *ETCD) setLearnerProgress(ctx context.Context, status *learnerProgress) error {
	var encoded bytes.Buffer
	if err := json.NewEncoder(&encoded).Encode(status); err != nil {
		return err
	}

	if _, err := e.client.Put(ctx, learnerProgressKey, encoded.String()); err != nil {
		return err
	}
	return nil
}
2022-02-23 21:52:46 +00:00
// clearAlarms checks for any alarms on the local etcd member. If found, they are
// reported and the alarm state is cleared.
func ( e * ETCD ) clearAlarms ( ctx context . Context ) error {
ctx , cancel := context . WithTimeout ( ctx , testTimeout )
defer cancel ( )
if e . client == nil {
return errors . New ( "etcd client was nil" )
}
alarmList , err := e . client . AlarmList ( ctx )
if err != nil {
return fmt . Errorf ( "etcd alarm list failed: %v" , err )
}
for _ , alarm := range alarmList . Alarms {
2022-03-25 18:52:40 +00:00
logrus . Warnf ( "Alarm on etcd member %d: %s" , alarm . MemberID , alarm . Alarm )
2022-02-23 21:52:46 +00:00
}
2022-03-25 18:52:40 +00:00
if len ( alarmList . Alarms ) > 0 {
2022-02-23 21:52:46 +00:00
if _ , err := e . client . AlarmDisarm ( ctx , & clientv3 . AlarmMember { } ) ; err != nil {
return fmt . Errorf ( "etcd alarm disarm failed: %v" , err )
}
logrus . Infof ( "Alarms disarmed on etcd server" )
}
return nil
}
2022-03-25 18:52:40 +00:00
func ( e * ETCD ) defragment ( ctx context . Context ) error {
ctx , cancel := context . WithTimeout ( ctx , testTimeout )
defer cancel ( )
if e . client == nil {
return errors . New ( "etcd client was nil" )
}
logrus . Infof ( "Defragmenting etcd database" )
2022-04-12 16:59:47 +00:00
endpoints := getEndpoints ( e . config )
2022-03-25 18:52:40 +00:00
_ , err := e . client . Defragment ( ctx , endpoints [ 0 ] )
return err
}
2022-02-24 22:35:08 +00:00
// clientURLs returns a list of all non-learner etcd cluster member client access URLs.
// The list is retrieved from the remote server that is being joined.
2021-03-01 21:50:50 +00:00
func ClientURLs ( ctx context . Context , clientAccessInfo * clientaccess . Info , selfIP string ) ( [ ] string , Members , error ) {
2020-07-29 20:52:49 +00:00
var memberList Members
2021-03-06 10:29:57 +00:00
resp , err := clientAccessInfo . Get ( "/db/info" )
2020-07-29 20:52:49 +00:00
if err != nil {
return nil , memberList , err
}
if err := json . Unmarshal ( resp , & memberList ) ; err != nil {
return nil , memberList , err
}
2023-03-24 22:19:44 +00:00
ip , err := getAdvertiseAddress ( selfIP )
2021-03-01 21:50:50 +00:00
if err != nil {
return nil , memberList , err
}
2020-07-29 20:52:49 +00:00
var clientURLs [ ] string
2021-03-01 21:50:50 +00:00
members :
2020-07-29 20:52:49 +00:00
for _ , member := range memberList . Members {
// excluding learner member from the client list
if member . IsLearner {
continue
}
2021-08-12 22:59:04 +00:00
for _ , clientURL := range member . ClientURLs {
u , err := url . Parse ( clientURL )
if err != nil {
continue
}
if u . Hostname ( ) == ip {
2021-03-01 21:50:50 +00:00
continue members
}
}
2020-07-29 20:52:49 +00:00
clientURLs = append ( clientURLs , member . ClientURLs ... )
}
return clientURLs , memberList , nil
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
// Restore performs a restore of the ETCD datastore from
// the given snapshot path. This operation exists upon
// completion.
func ( e * ETCD ) Restore ( ctx context . Context ) error {
// check the old etcd data dir
2023-09-21 18:48:21 +00:00
oldDataDir := dbDir ( e . config ) + "-old-" + strconv . Itoa ( int ( time . Now ( ) . Unix ( ) ) )
2020-09-30 00:53:31 +00:00
if e . config . ClusterResetRestorePath == "" {
return errors . New ( "no etcd restore path was specified" )
}
// make sure snapshot exists before restoration
if _ , err := os . Stat ( e . config . ClusterResetRestorePath ) ; err != nil {
return err
}
2022-01-14 17:31:22 +00:00
var restorePath string
if strings . HasSuffix ( e . config . ClusterResetRestorePath , compressedExtension ) {
snapshotDir , err := snapshotDir ( e . config , true )
if err != nil {
return errors . Wrap ( err , "failed to get the snapshot dir" )
}
decompressSnapshot , err := e . decompressSnapshot ( snapshotDir , e . config . ClusterResetRestorePath )
if err != nil {
return err
}
restorePath = decompressSnapshot
} else {
restorePath = e . config . ClusterResetRestorePath
}
2020-09-30 00:53:31 +00:00
// move the data directory to a temp path
2023-09-21 18:48:21 +00:00
if err := os . Rename ( dbDir ( e . config ) , oldDataDir ) ; err != nil {
2020-09-30 00:53:31 +00:00
return err
}
2022-01-14 17:31:22 +00:00
2020-09-30 00:53:31 +00:00
logrus . Infof ( "Pre-restore etcd database moved to %s" , oldDataDir )
2022-01-14 17:31:22 +00:00
2022-04-13 23:22:07 +00:00
lg , err := logutil . CreateDefaultZapLogger ( zap . InfoLevel )
if err != nil {
return err
}
return snapshot . NewV3 ( lg ) . Restore ( snapshot . RestoreConfig {
2022-01-14 17:31:22 +00:00
SnapshotPath : restorePath ,
2020-09-30 00:53:31 +00:00
Name : e . name ,
2023-09-21 18:48:21 +00:00
OutputDataDir : dbDir ( e . config ) ,
2020-09-30 00:53:31 +00:00
OutputWALDir : walDir ( e . config ) ,
PeerURLs : [ ] string { e . peerURL ( ) } ,
InitialCluster : e . name + "=" + e . peerURL ( ) ,
2021-07-03 11:24:58 +00:00
} )
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
}
2020-12-07 20:30:44 +00:00
// backupDirWithRetention moves dir to a sibling path named
// "<dir>-backup-<unix timestamp>" and returns that new path.
//
// Before the move, existing sibling directories whose names start with
// "<base(dir)>-backup" are ordered newest-first by modification time, and every
// one beyond the first maxBackupRetention is removed. The backup created by this
// call is not counted against the limit, since it does not exist yet when the
// pruning happens.
//
// If dir cannot be stat'ed (for example because it does not exist) there is
// nothing to back up and ("", nil) is returned.
func backupDirWithRetention(dir string, maxBackupRetention int) (string, error) {
	backupDir := dir + "-backup-" + strconv.Itoa(int(time.Now().Unix()))
	if _, err := os.Stat(dir); err != nil {
		// Deliberately best-effort: an unreadable or missing dir means no backup.
		return "", nil
	}

	parent := filepath.Dir(dir)
	entries, err := os.ReadDir(parent)
	if err != nil {
		return "", err
	}

	// Only previous backups of this directory are relevant. Filtering on the
	// directory entry first avoids failing on unrelated siblings.
	prefix := filepath.Base(dir) + "-backup"
	backups := make([]fs.FileInfo, 0, len(entries))
	for _, entry := range entries {
		if !entry.IsDir() || !strings.HasPrefix(entry.Name(), prefix) {
			continue
		}
		info, err := entry.Info()
		if err != nil {
			// The entry vanished between ReadDir and Info; nothing left to prune.
			if os.IsNotExist(err) {
				continue
			}
			return "", err
		}
		backups = append(backups, info)
	}

	// Newest first. Names are used as a tie-breaker so the order is
	// deterministic when modification times are equal.
	sort.Slice(backups, func(i, j int) bool {
		mi, mj := backups[i].ModTime(), backups[j].ModTime()
		if mi.Equal(mj) {
			return backups[i].Name() > backups[j].Name()
		}
		return mi.After(mj)
	})

	// Keep the newest maxBackupRetention backups and remove the rest.
	for i, backup := range backups {
		if i < maxBackupRetention {
			continue
		}
		if err := os.RemoveAll(filepath.Join(parent, backup.Name())); err != nil {
			return "", err
		}
	}

	// Move the directory to its backup path.
	if err := os.Rename(dir, backupDir); err != nil {
		return "", err
	}
	return backupDir, nil
}
2021-02-12 15:35:57 +00:00
2022-02-16 22:19:58 +00:00
// GetAPIServerURLsFromETCD will try to fetch the version.Program/apiaddresses key from etcd
2022-02-24 22:35:08 +00:00
// and unmarshal it to a list of apiserver endpoints.
2022-02-16 22:19:58 +00:00
func GetAPIServerURLsFromETCD ( ctx context . Context , cfg * config . Control ) ( [ ] string , error ) {
2023-09-21 18:48:21 +00:00
cl , err := getClient ( ctx , cfg )
2021-02-12 15:35:57 +00:00
if err != nil {
2022-02-16 22:19:58 +00:00
return nil , err
2021-02-12 15:35:57 +00:00
}
2022-03-10 22:03:02 +00:00
defer cl . Close ( )
2021-02-12 15:35:57 +00:00
etcdResp , err := cl . KV . Get ( ctx , AddressKey )
if err != nil {
2022-02-16 22:19:58 +00:00
return nil , err
2021-02-12 15:35:57 +00:00
}
2022-02-16 22:19:58 +00:00
if etcdResp . Count == 0 || len ( etcdResp . Kvs [ 0 ] . Value ) == 0 {
return nil , ErrAddressNotSet
2021-02-12 15:35:57 +00:00
}
2022-02-16 22:19:58 +00:00
2021-02-12 15:35:57 +00:00
var addresses [ ] string
if err := json . Unmarshal ( etcdResp . Kvs [ 0 ] . Value , & addresses ) ; err != nil {
2022-02-16 22:19:58 +00:00
return nil , fmt . Errorf ( "failed to unmarshal apiserver addresses from etcd: %v" , err )
2021-02-12 15:35:57 +00:00
}
2022-02-16 22:19:58 +00:00
return addresses , nil
2021-02-12 15:35:57 +00:00
}
// GetMembersClientURLs will list through the member lists in etcd and return
// back a combined list of client urls for each member in the cluster
func ( e * ETCD ) GetMembersClientURLs ( ctx context . Context ) ( [ ] string , error ) {
2023-02-13 20:00:52 +00:00
return e . client . Endpoints ( ) , nil
2021-02-12 15:35:57 +00:00
}
2021-03-01 21:50:50 +00:00
2021-09-14 15:20:38 +00:00
// GetMembersNames asks etcd for its member list and returns the Name of each
// member, in the order the members are reported. The request is bounded by
// testTimeout. The result is a nil slice when no members are returned.
func (e *ETCD) GetMembersNames(ctx context.Context) ([]string, error) {
	listCtx, cancel := context.WithTimeout(ctx, testTimeout)
	defer cancel()

	resp, err := e.client.MemberList(listCtx)
	if err != nil {
		return nil, err
	}

	var names []string
	for i := range resp.Members {
		names = append(names, resp.Members[i].Name)
	}
	return names, nil
}
2023-10-13 22:47:12 +00:00
// RemoveSelf will remove the member if it exists in the cluster. This should
// only be called on a node that may have previously run etcd, but will not
// currently run etcd, to ensure that it is not a member of the cluster.
// This is also called by tests to do cleanup between runs.
2021-03-01 21:50:50 +00:00
func ( e * ETCD ) RemoveSelf ( ctx context . Context ) error {
2023-10-13 22:47:12 +00:00
if e . client == nil {
if err := e . startClient ( ctx ) ; err != nil {
return err
}
}
2021-09-14 15:20:38 +00:00
if err := e . RemovePeer ( ctx , e . name , e . address , true ) ; err != nil {
2021-03-16 16:14:43 +00:00
return err
}
// backup the data dir to avoid issues when re-enabling etcd
2023-09-21 18:48:21 +00:00
oldDataDir := dbDir ( e . config ) + "-old-" + strconv . Itoa ( int ( time . Now ( ) . Unix ( ) ) )
2021-03-16 16:14:43 +00:00
// move the data directory to a temp path
2023-09-21 18:48:21 +00:00
return os . Rename ( dbDir ( e . config ) , oldDataDir )
2021-03-01 21:50:50 +00:00
}