2020-05-05 22:02:16 +00:00
package etcd
import (
2022-01-14 17:31:22 +00:00
"archive/zip"
2020-10-27 18:06:26 +00:00
"bytes"
2020-05-05 22:02:16 +00:00
"context"
"crypto/tls"
2021-11-29 18:30:04 +00:00
"encoding/base64"
2020-05-05 22:02:16 +00:00
"encoding/json"
"fmt"
2022-01-14 17:31:22 +00:00
"io"
2022-10-08 00:36:57 +00:00
"io/fs"
2023-01-10 18:51:39 +00:00
"math/rand"
2022-04-15 00:31:49 +00:00
"net"
2020-05-05 22:02:16 +00:00
"net/http"
"net/url"
"os"
"path/filepath"
2022-06-14 22:12:28 +00:00
"regexp"
2021-05-01 01:26:39 +00:00
"runtime"
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
"sort"
"strconv"
2020-05-05 22:02:16 +00:00
"strings"
"time"
"github.com/google/uuid"
"github.com/gorilla/mux"
2022-03-02 23:47:27 +00:00
"github.com/k3s-io/k3s/pkg/clientaccess"
"github.com/k3s-io/k3s/pkg/daemons/config"
"github.com/k3s-io/k3s/pkg/daemons/control/deps"
"github.com/k3s-io/k3s/pkg/daemons/executor"
"github.com/k3s-io/k3s/pkg/version"
2021-04-26 16:47:53 +00:00
"github.com/k3s-io/kine/pkg/client"
endpoint2 "github.com/k3s-io/kine/pkg/endpoint"
2021-05-01 01:26:39 +00:00
"github.com/minio/minio-go/v7"
2022-02-24 22:35:08 +00:00
cp "github.com/otiai10/copy"
2020-05-05 22:02:16 +00:00
"github.com/pkg/errors"
certutil "github.com/rancher/dynamiclistener/cert"
2021-09-14 15:20:38 +00:00
controllerv1 "github.com/rancher/wrangler/pkg/generated/controllers/core/v1"
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
"github.com/robfig/cron/v3"
2020-05-05 22:02:16 +00:00
"github.com/sirupsen/logrus"
2021-07-02 19:55:47 +00:00
"go.etcd.io/etcd/api/v3/etcdserverpb"
"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
2022-04-13 23:22:07 +00:00
"go.etcd.io/etcd/client/pkg/v3/logutil"
2021-07-02 19:55:47 +00:00
clientv3 "go.etcd.io/etcd/client/v3"
"go.etcd.io/etcd/etcdutl/v3/snapshot"
2022-04-13 23:22:07 +00:00
"go.uber.org/zap"
2022-07-09 01:27:05 +00:00
"golang.org/x/sync/semaphore"
2021-05-01 01:26:39 +00:00
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
2020-10-27 18:06:26 +00:00
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2020-05-05 22:02:16 +00:00
utilnet "k8s.io/apimachinery/pkg/util/net"
2023-01-10 18:51:39 +00:00
"k8s.io/apimachinery/pkg/util/wait"
2021-05-01 01:26:39 +00:00
"k8s.io/client-go/util/retry"
2021-02-12 15:35:57 +00:00
)
2020-10-27 18:06:26 +00:00
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
const (
	// Durations used by the etcd management code. Their consumers live
	// outside this declaration.
	testTimeout          = time.Second * 30
	manageTickerTime     = time.Second * 15
	learnerMaxStallTime  = time.Minute * 5
	memberRemovalTimeout = time.Minute * 1

	// snapshotJitterMax defines the maximum time skew on cron-triggered snapshots. The actual jitter
	// will be a random Duration somewhere between 0 and snapshotJitterMax.
	snapshotJitterMax = time.Second * 5

	// defaultDialTimeout is intentionally short so that connections timeout within the testTimeout defined above
	defaultDialTimeout = 2 * time.Second
	// other defaults from k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go
	defaultKeepAliveTime    = 30 * time.Second
	defaultKeepAliveTimeout = 10 * time.Second

	// Snapshot-related limits and the file extension used for compressed
	// snapshots.
	maxBackupRetention     = 5
	maxConcurrentSnapshots = 1
	compressedExtension    = ".zip"

	// Node role label keys.
	MasterLabel       = "node-role.kubernetes.io/master"
	ControlPlaneLabel = "node-role.kubernetes.io/control-plane"
	EtcdRoleLabel     = "node-role.kubernetes.io/etcd"
)
2021-05-01 01:26:39 +00:00
var (
learnerProgressKey = version . Program + "/etcd/learnerProgress"
// AddressKey will contain the value of api addresses list
AddressKey = version . Program + "/apiaddresses"
2021-11-29 18:30:04 +00:00
snapshotExtraMetadataConfigMapName = version . Program + "-etcd-snapshot-extra-metadata"
snapshotConfigMapName = version . Program + "-etcd-snapshots"
2021-09-14 15:20:38 +00:00
2023-01-10 18:51:39 +00:00
// snapshotDataBackoff will retry at increasing steps for up to ~30 seconds.
// If the ConfigMap update fails, the list won't be reconciled again until next time
// the server starts, so we should be fairly persistent in retrying.
snapshotDataBackoff = wait . Backoff {
Steps : 9 ,
Duration : 10 * time . Millisecond ,
Factor : 3.0 ,
Jitter : 0.1 ,
}
// cronLogger wraps logrus's Printf output as cron-compatible logger
cronLogger = cron . VerbosePrintfLogger ( logrus . StandardLogger ( ) )
2021-09-14 15:20:38 +00:00
NodeNameAnnotation = "etcd." + version . Program + ".cattle.io/node-name"
NodeAddressAnnotation = "etcd." + version . Program + ".cattle.io/node-address"
2022-02-16 22:19:58 +00:00
ErrAddressNotSet = errors . New ( "apiserver addresses not yet set" )
2022-03-01 23:55:35 +00:00
ErrNotMember = errNotMember ( )
2022-06-14 22:12:28 +00:00
invalidKeyChars = regexp . MustCompile ( ` [^-._a-zA-Z0-9] ` )
2021-05-01 01:26:39 +00:00
)
2021-09-14 15:20:38 +00:00
// NodeControllerGetter is a function that returns a
// controllerv1.NodeController when called.
type NodeControllerGetter func() controllerv1.NodeController
2021-05-01 01:26:39 +00:00
type ETCD struct {
2022-07-09 01:27:05 +00:00
client * clientv3 . Client
config * config . Control
name string
address string
cron * cron . Cron
s3 * S3
cancel context . CancelFunc
snapshotSem * semaphore . Weighted
2021-05-01 01:26:39 +00:00
}
// learnerProgress is a JSON-serialisable record describing one member by ID
// and name, together with a raft applied index and a timestamp.
type learnerProgress struct {
	ID               uint64 `json:"id,omitempty"`
	Name             string `json:"name,omitempty"`
	RaftAppliedIndex uint64 `json:"raftAppliedIndex,omitempty"`
	// LastProgress is a metav1.Time; presumably the moment the applied index
	// was last seen to advance - confirm against the code that updates it.
	LastProgress metav1.Time `json:"lastProgress,omitempty"`
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
// Members contains a slice that holds all
// members of the cluster.
2020-05-05 22:02:16 +00:00
type Members struct {
Members [ ] * etcdserverpb . Member ` json:"members" `
}
2022-03-01 23:55:35 +00:00
// MembershipError is the error type reported when this server is not part of
// the etcd cluster. Its Error method prints Members as the entries that were
// found and Self as the entry that was expected.
type MembershipError struct {
	// Self is the entry expected to be present for this server.
	Self string
	// Members lists the entries that were actually found.
	Members []string
}
// Error implements the error interface. The message lists the members that
// were found (e.Members) followed by the entry that was expected (e.Self).
func (e *MembershipError) Error() string {
	return fmt.Sprintf("this server is not a member of the etcd cluster. Found %v, expect: %s", e.Members, e.Self)
}
// Is reports whether target is the ErrNotMember sentinel. The receiver's
// fields are not consulted, so errors.Is(err, ErrNotMember) succeeds for any
// *MembershipError in the chain regardless of its Self or Members values.
func (e *MembershipError) Is(target error) bool {
	return target == ErrNotMember
}
// errNotMember returns a freshly allocated, zero-valued *MembershipError
// wrapped in the error interface.
func errNotMember() error {
	return new(MembershipError)
}
2021-05-01 01:26:39 +00:00
// NewETCD creates a new value of type
// ETCD with an initialized cron value.
func NewETCD ( ) * ETCD {
return & ETCD {
2023-01-10 18:51:39 +00:00
cron : cron . New ( cron . WithLogger ( cronLogger ) ) ,
2021-05-01 01:26:39 +00:00
}
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
// EndpointName returns the name of the endpoint.
2020-05-05 22:02:16 +00:00
func (e *ETCD) EndpointName() string {
	// The name is a fixed identifier; no receiver state is consulted.
	const endpointName = "etcd"
	return endpointName
}
2021-05-07 23:10:04 +00:00
// SetControlConfig sets the given config on the etcd struct.
2022-02-24 22:35:08 +00:00
func ( e * ETCD ) SetControlConfig ( ctx context . Context , config * config . Control ) error {
2021-05-07 23:10:04 +00:00
e . config = config
2022-02-24 22:35:08 +00:00
2022-04-12 16:59:47 +00:00
client , err := GetClient ( ctx , e . config )
2022-02-24 22:35:08 +00:00
if err != nil {
return err
}
e . client = client
2022-03-10 22:03:02 +00:00
go func ( ) {
<- ctx . Done ( )
e . client . Close ( )
} ( )
2023-03-24 22:19:44 +00:00
address , err := getAdvertiseAddress ( config . PrivateIP )
2022-02-24 22:35:08 +00:00
if err != nil {
return err
}
e . address = address
return e . setName ( false )
2021-05-07 23:10:04 +00:00
}
2022-03-25 18:52:40 +00:00
// Test ensures that the local node is a voting member of the target cluster,
// and that the datastore is defragmented and not in maintenance mode due to alarms.
2020-10-27 18:06:26 +00:00
// If it is still a learner or not a part of the cluster, an error is raised.
2022-03-25 18:52:40 +00:00
// If it cannot be defragmented or has any alarms that cannot be disarmed, an error is raised.
2020-10-27 18:06:26 +00:00
func ( e * ETCD ) Test ( ctx context . Context ) error {
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
ctx , cancel := context . WithTimeout ( ctx , testTimeout )
2020-05-05 22:02:16 +00:00
defer cancel ( )
2020-10-27 18:06:26 +00:00
2022-04-12 16:59:47 +00:00
endpoints := getEndpoints ( e . config )
2022-02-24 22:35:08 +00:00
status , err := e . client . Status ( ctx , endpoints [ 0 ] )
2020-07-29 20:52:49 +00:00
if err != nil {
return err
}
if status . IsLearner {
2020-10-27 18:06:26 +00:00
return errors . New ( "this server has not yet been promoted from learner to voting member" )
2020-07-29 20:52:49 +00:00
}
2020-10-27 18:06:26 +00:00
2022-03-25 18:52:40 +00:00
if err := e . defragment ( ctx ) ; err != nil {
return errors . Wrap ( err , "failed to defragment etcd database" )
}
2022-02-23 21:52:46 +00:00
if err := e . clearAlarms ( ctx ) ; err != nil {
return errors . Wrap ( err , "failed to report and disarm etcd alarms" )
}
2022-03-25 18:52:40 +00:00
// refresh status to see if any errors remain after clearing alarms
status , err = e . client . Status ( ctx , endpoints [ 0 ] )
if err != nil {
return err
}
if len ( status . Errors ) > 0 {
return fmt . Errorf ( "etcd cluster errors: %s" , strings . Join ( status . Errors , ", " ) )
}
2020-05-05 22:02:16 +00:00
members , err := e . client . MemberList ( ctx )
if err != nil {
return err
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
var memberNameUrls [ ] string
2020-05-05 22:02:16 +00:00
for _ , member := range members . Members {
for _ , peerURL := range member . PeerURLs {
if peerURL == e . peerURL ( ) && e . name == member . Name {
return nil
}
}
if len ( member . PeerURLs ) > 0 {
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
memberNameUrls = append ( memberNameUrls , member . Name + "=" + member . PeerURLs [ 0 ] )
2020-05-05 22:02:16 +00:00
}
}
2022-07-29 16:40:53 +00:00
return & MembershipError { Members : memberNameUrls , Self : e . name + "=" + e . peerURL ( ) }
2020-05-05 22:02:16 +00:00
}
2021-11-10 12:33:42 +00:00
// DBDir returns the path to dataDir/db/etcd
func DBDir ( config * config . Control ) string {
2020-09-24 05:59:58 +00:00
return filepath . Join ( config . DataDir , "db" , "etcd" )
2020-05-05 22:02:16 +00:00
}
2020-09-24 05:59:58 +00:00
// walDir returns the path to etcdDBDir/member/wal
func walDir ( config * config . Control ) string {
2021-11-10 12:33:42 +00:00
return filepath . Join ( DBDir ( config ) , "member" , "wal" )
2020-05-05 22:02:16 +00:00
}
2021-04-26 16:47:53 +00:00
// sqliteFile returns the path dataDir/db/state.db built from config.DataDir.
// NOTE(review): presumably the SQLite datastore file, going by the name — confirm against callers.
func sqliteFile(config *config.Control) string {
	elems := []string{config.DataDir, "db", "state.db"}
	return filepath.Join(elems...)
}
2021-03-03 18:14:12 +00:00
// nameFile returns the path to etcdDBDir/name.
2020-05-05 22:02:16 +00:00
func nameFile ( config * config . Control ) string {
2021-11-10 12:33:42 +00:00
return filepath . Join ( DBDir ( config ) , "name" )
2020-05-05 22:02:16 +00:00
}
2021-03-03 18:14:12 +00:00
// ResetFile returns the path to etcdDBDir/reset-flag.
2020-09-30 00:53:31 +00:00
func ResetFile ( config * config . Control ) string {
return filepath . Join ( config . DataDir , "db" , "reset-flag" )
}
2020-09-22 03:23:18 +00:00
// IsInitialized checks to see if a WAL directory exists. If so, we assume that etcd
// has already been brought up at least once.
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) IsInitialized ( ctx context . Context , config * config . Control ) ( bool , error ) {
2020-09-22 03:23:18 +00:00
dir := walDir ( config )
if s , err := os . Stat ( dir ) ; err == nil && s . IsDir ( ) {
2020-05-05 22:02:16 +00:00
return true , nil
} else if os . IsNotExist ( err ) {
return false , nil
} else {
2021-10-07 19:47:00 +00:00
return false , errors . Wrap ( err , "invalid state for wal directory " + dir )
2020-05-05 22:02:16 +00:00
}
}
2021-11-10 12:33:42 +00:00
// Reset resets an etcd node to a single node cluster.
2021-03-11 20:07:40 +00:00
func ( e * ETCD ) Reset ( ctx context . Context , rebootstrap func ( ) error ) error {
2020-09-22 03:23:18 +00:00
// Wait for etcd to come up as a new single-node cluster, then exit
2020-05-05 22:02:16 +00:00
go func ( ) {
2022-02-24 19:01:14 +00:00
<- e . config . Runtime . AgentReady
2020-07-29 20:52:49 +00:00
t := time . NewTicker ( 5 * time . Second )
defer t . Stop ( )
for range t . C {
2021-12-14 00:04:39 +00:00
// resetting the apiaddresses to nil since we are doing a restoration
if _ , err := e . client . Put ( ctx , AddressKey , "" ) ; err != nil {
logrus . Warnf ( "failed to reset api addresses key in etcd: %v" , err )
continue
}
2020-10-27 18:06:26 +00:00
if err := e . Test ( ctx ) ; err == nil {
2020-05-05 22:02:16 +00:00
members , err := e . client . MemberList ( ctx )
if err != nil {
continue
}
2021-12-09 21:54:27 +00:00
if rebootstrap != nil {
// storageBootstrap() - runtime structure has been written with correct certificate data
if err := rebootstrap ( ) ; err != nil {
logrus . Fatal ( err )
}
2021-03-03 18:14:12 +00:00
}
// call functions to rewrite them from daemons/control/server.go (prepare())
2022-02-24 19:01:14 +00:00
if err := deps . GenServerDeps ( e . config ) ; err != nil {
2021-03-03 18:14:12 +00:00
logrus . Fatal ( err )
}
2020-05-05 22:02:16 +00:00
if len ( members . Members ) == 1 && members . Members [ 0 ] . Name == e . name {
2022-04-27 20:44:15 +00:00
// Cancel the etcd server context and allow it time to shutdown cleanly.
// Ideally we would use a waitgroup and properly sequence shutdown of the various components.
e . cancel ( )
time . Sleep ( time . Second * 5 )
logrus . Infof ( "Managed etcd cluster membership has been reset, restart without --cluster-reset flag now. Backup and delete ${datadir}/server/db on each peer etcd server and rejoin the nodes" )
2020-05-05 22:02:16 +00:00
os . Exit ( 0 )
}
2021-10-22 22:25:29 +00:00
} else {
// make sure that peer ips are updated to the node ip in case the test fails
members , err := e . client . MemberList ( ctx )
if err != nil {
logrus . Warnf ( "failed to list etcd members: %v" , err )
continue
}
if len ( members . Members ) > 1 {
logrus . Warnf ( "failed to update peer url: etcd still has more than one member" )
continue
}
if _ , err := e . client . MemberUpdate ( ctx , members . Members [ 0 ] . ID , [ ] string { e . peerURL ( ) } ) ; err != nil {
logrus . Warnf ( "failed to update peer url: %v" , err )
continue
}
2020-05-05 22:02:16 +00:00
}
2021-10-22 22:25:29 +00:00
2020-05-05 22:02:16 +00:00
}
} ( )
2020-09-22 03:23:18 +00:00
// If asked to restore from a snapshot, do so
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
if e . config . ClusterResetRestorePath != "" {
2021-03-03 18:14:12 +00:00
if e . config . EtcdS3 {
2021-05-07 23:10:04 +00:00
if err := e . initS3IfNil ( ctx ) ; err != nil {
return err
2021-03-03 18:14:12 +00:00
}
logrus . Infof ( "Retrieving etcd snapshot %s from S3" , e . config . ClusterResetRestorePath )
2021-06-30 20:29:03 +00:00
if err := e . s3 . Download ( ctx ) ; err != nil {
2021-03-03 18:14:12 +00:00
return err
}
logrus . Infof ( "S3 download complete for %s" , e . config . ClusterResetRestorePath )
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
info , err := os . Stat ( e . config . ClusterResetRestorePath )
if os . IsNotExist ( err ) {
return fmt . Errorf ( "etcd: snapshot path does not exist: %s" , e . config . ClusterResetRestorePath )
}
if info . IsDir ( ) {
2020-09-22 03:23:18 +00:00
return fmt . Errorf ( "etcd: snapshot path must be a file, not a directory: %s" , e . config . ClusterResetRestorePath )
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
}
2020-09-19 01:09:36 +00:00
if err := e . Restore ( ctx ) ; err != nil {
return err
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
}
2020-09-22 03:23:18 +00:00
2020-09-19 01:09:36 +00:00
if err := e . setName ( true ) ; err != nil {
return err
}
2020-09-30 00:53:31 +00:00
// touch a file to avoid multiple resets
2022-10-08 00:36:57 +00:00
if err := os . WriteFile ( ResetFile ( e . config ) , [ ] byte { } , 0600 ) ; err != nil {
2020-09-30 00:53:31 +00:00
return err
}
2020-05-05 22:02:16 +00:00
return e . newCluster ( ctx , true )
}
2020-09-22 03:23:18 +00:00
// Start starts the datastore
2020-07-29 20:52:49 +00:00
func ( e * ETCD ) Start ( ctx context . Context , clientAccessInfo * clientaccess . Info ) error {
2022-02-16 22:19:58 +00:00
isInitialized , err := e . IsInitialized ( ctx , e . config )
2020-05-05 22:02:16 +00:00
if err != nil {
2020-09-22 03:23:18 +00:00
return errors . Wrapf ( err , "configuration validation failed" )
2020-05-05 22:02:16 +00:00
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
if ! e . config . EtcdDisableSnapshots {
e . setSnapshotFunction ( ctx )
e . cron . Start ( )
}
2020-10-27 18:06:26 +00:00
go e . manageLearners ( ctx )
2022-02-16 22:19:58 +00:00
if isInitialized {
2021-01-06 17:05:49 +00:00
//check etcd dir permission
2021-11-10 12:33:42 +00:00
etcdDir := DBDir ( e . config )
2021-01-06 17:05:49 +00:00
info , err := os . Stat ( etcdDir )
if err != nil {
return err
}
if info . Mode ( ) != 0700 {
if err := os . Chmod ( etcdDir , 0700 ) ; err != nil {
return err
}
}
2020-05-05 22:02:16 +00:00
opt , err := executor . CurrentETCDOptions ( )
if err != nil {
return err
}
2022-02-24 22:35:08 +00:00
logrus . Infof ( "Starting etcd for existing cluster member" )
2020-05-05 22:02:16 +00:00
return e . cluster ( ctx , false , opt )
}
2020-07-29 20:52:49 +00:00
if clientAccessInfo == nil {
2020-05-05 22:02:16 +00:00
return e . newCluster ( ctx , false )
}
2021-05-01 01:26:39 +00:00
2021-10-12 06:13:10 +00:00
go func ( ) {
2022-02-16 22:19:58 +00:00
for {
select {
case <- time . After ( 30 * time . Second ) :
logrus . Infof ( "Waiting for agent to become ready before joining ETCD cluster" )
case <- e . config . Runtime . AgentReady :
if err := e . join ( ctx , clientAccessInfo ) ; err != nil {
logrus . Fatalf ( "ETCD join failed: %v" , err )
}
return
case <- ctx . Done ( ) :
return
}
2021-10-12 06:13:10 +00:00
}
} ( )
return nil
2020-05-05 22:02:16 +00:00
}
2020-09-22 03:23:18 +00:00
// join attempts to add a member to an existing cluster
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) join ( ctx context . Context , clientAccessInfo * clientaccess . Info ) error {
2021-09-15 05:13:31 +00:00
clientCtx , cancel := context . WithTimeout ( ctx , 20 * time . Second )
defer cancel ( )
var (
cluster [ ] string
add = true
)
clientURLs , memberList , err := ClientURLs ( clientCtx , clientAccessInfo , e . config . PrivateIP )
2020-05-05 22:02:16 +00:00
if err != nil {
return err
}
2022-04-12 16:59:47 +00:00
client , err := GetClient ( clientCtx , e . config , clientURLs ... )
2020-05-05 22:02:16 +00:00
if err != nil {
return err
}
2020-10-27 18:06:26 +00:00
defer client . Close ( )
2020-05-05 22:02:16 +00:00
2021-09-15 05:13:31 +00:00
members , err := client . MemberList ( clientCtx )
2020-05-05 22:02:16 +00:00
if err != nil {
2020-09-21 16:56:03 +00:00
logrus . Errorf ( "Failed to get member list from etcd cluster. Will assume this member is already added" )
2021-07-02 19:55:47 +00:00
members = & clientv3 . MemberListResponse {
2020-05-05 22:02:16 +00:00
Members : append ( memberList . Members , & etcdserverpb . Member {
Name : e . name ,
PeerURLs : [ ] string { e . peerURL ( ) } ,
} ) ,
}
add = false
}
for _ , member := range members . Members {
for _ , peer := range member . PeerURLs {
u , err := url . Parse ( peer )
if err != nil {
return err
}
2021-12-18 06:26:04 +00:00
// An uninitialized joining member won't have a name; if it has our
// address it must be us.
2020-05-05 22:02:16 +00:00
if member . Name == "" && u . Hostname ( ) == e . address {
member . Name = e . name
}
2021-12-18 06:26:04 +00:00
// If we're already in the cluster, don't try to add ourselves.
if member . Name == e . name && u . Hostname ( ) == e . address {
add = false
}
2020-05-05 22:02:16 +00:00
if len ( member . PeerURLs ) > 0 {
cluster = append ( cluster , fmt . Sprintf ( "%s=%s" , member . Name , member . PeerURLs [ 0 ] ) )
}
}
2021-12-18 06:26:04 +00:00
// Try to get the node name from the member name
memberNodeName := member . Name
if lastHyphen := strings . LastIndex ( member . Name , "-" ) ; lastHyphen > 1 {
memberNodeName = member . Name [ : lastHyphen ]
}
// Make sure there's not already a member in the cluster with a duplicate node name
if member . Name != e . name && memberNodeName == e . config . ServerNodeName {
// make sure to remove the name file if a duplicate node name is used, so that we
// generate a new member name when our node name is fixed.
nameFile := nameFile ( e . config )
if err := os . Remove ( nameFile ) ; err != nil {
logrus . Errorf ( "Failed to remove etcd name file %s: %v" , nameFile , err )
}
return errors . New ( "duplicate node name found, please use a unique name for this node" )
}
2020-05-05 22:02:16 +00:00
}
if add {
2021-10-12 06:13:10 +00:00
logrus . Infof ( "Adding member %s=%s to etcd cluster %v" , e . name , e . peerURL ( ) , cluster )
2021-09-15 05:13:31 +00:00
if _ , err = client . MemberAddAsLearner ( clientCtx , [ ] string { e . peerURL ( ) } ) ; err != nil {
2020-05-05 22:02:16 +00:00
return err
}
cluster = append ( cluster , fmt . Sprintf ( "%s=%s" , e . name , e . peerURL ( ) ) )
}
2022-02-24 22:35:08 +00:00
logrus . Infof ( "Starting etcd to join cluster with members %v" , cluster )
2020-05-05 22:02:16 +00:00
return e . cluster ( ctx , false , executor . InitialOptions {
Cluster : strings . Join ( cluster , "," ) ,
State : "existing" ,
} )
}
2020-09-24 05:40:00 +00:00
// Register configures a new etcd client and adds db info routes for the http request handler.
func ( e * ETCD ) Register ( ctx context . Context , config * config . Control , handler http . Handler ) ( http . Handler , error ) {
2020-05-05 22:02:16 +00:00
e . config = config
2022-04-12 16:59:47 +00:00
client , err := GetClient ( ctx , e . config )
2020-05-05 22:02:16 +00:00
if err != nil {
2020-09-24 05:40:00 +00:00
return nil , err
2020-05-05 22:02:16 +00:00
}
e . client = client
2022-03-10 22:03:02 +00:00
go func ( ) {
<- ctx . Done ( )
e . client . Close ( )
} ( )
2023-03-24 22:19:44 +00:00
address , err := getAdvertiseAddress ( config . PrivateIP )
2020-05-05 22:02:16 +00:00
if err != nil {
2020-09-24 05:40:00 +00:00
return nil , err
2020-05-05 22:02:16 +00:00
}
e . address = address
2022-02-24 22:35:08 +00:00
2022-04-12 16:59:47 +00:00
endpoints := getEndpoints ( config )
2022-02-24 22:35:08 +00:00
e . config . Datastore . Endpoint = endpoints [ 0 ]
2022-02-24 19:01:14 +00:00
e . config . Datastore . BackendTLSConfig . CAFile = e . config . Runtime . ETCDServerCA
e . config . Datastore . BackendTLSConfig . CertFile = e . config . Runtime . ClientETCDCert
e . config . Datastore . BackendTLSConfig . KeyFile = e . config . Runtime . ClientETCDKey
2020-05-05 22:02:16 +00:00
2023-02-08 00:37:10 +00:00
e . config . Runtime . ClusterControllerStarts [ "etcd-node-metadata" ] = func ( ctx context . Context ) {
registerMetadataHandlers ( ctx , e )
2021-09-14 15:20:38 +00:00
}
2023-02-08 00:37:10 +00:00
// The apiserver endpoint controller needs to run on a node with a local apiserver,
2023-02-13 20:00:52 +00:00
// in order to successfully seed etcd with the endpoint list. The member removal controller
// also needs to run on a non-etcd node as to avoid disruption if running on the node that
// is being removed from the cluster.
2023-02-08 00:37:10 +00:00
if ! e . config . DisableAPIServer {
2023-02-13 20:00:52 +00:00
e . config . Runtime . LeaderElectedClusterControllerStarts [ version . Program + "-etcd" ] = func ( ctx context . Context ) {
2023-02-08 00:37:10 +00:00
registerEndpointsHandlers ( ctx , e )
2023-02-13 20:00:52 +00:00
registerMemberHandlers ( ctx , e )
2023-02-08 00:37:10 +00:00
}
2020-05-05 22:02:16 +00:00
}
2022-04-06 00:11:24 +00:00
2023-02-13 20:00:52 +00:00
// Tombstone file checking is unnecessary if we're not running etcd.
2023-02-08 00:37:10 +00:00
if ! e . config . DisableETCD {
tombstoneFile := filepath . Join ( DBDir ( e . config ) , "tombstone" )
if _ , err := os . Stat ( tombstoneFile ) ; err == nil {
logrus . Infof ( "tombstone file has been detected, removing data dir to rejoin the cluster" )
if _ , err := backupDirWithRetention ( DBDir ( e . config ) , maxBackupRetention ) ; err != nil {
return nil , err
}
}
if err := e . setName ( false ) ; err != nil {
return nil , err
}
2020-12-07 20:30:44 +00:00
}
2021-09-14 15:20:38 +00:00
2023-02-08 00:37:10 +00:00
return e . handler ( handler ) , nil
2020-05-05 22:02:16 +00:00
}
2020-09-22 03:23:18 +00:00
// setName sets a unique name for this cluster member. The first time this is called,
// or if force is set to true, a new name will be generated and written to disk. The persistent
// name is used on subsequent calls.
2020-09-19 01:09:36 +00:00
func ( e * ETCD ) setName ( force bool ) error {
2020-05-05 22:02:16 +00:00
fileName := nameFile ( e . config )
2022-10-08 00:36:57 +00:00
data , err := os . ReadFile ( fileName )
2020-09-19 01:09:36 +00:00
if os . IsNotExist ( err ) || force {
2021-09-17 22:51:18 +00:00
e . name = e . config . ServerNodeName + "-" + uuid . New ( ) . String ( ) [ : 8 ]
2020-09-24 06:01:35 +00:00
if err := os . MkdirAll ( filepath . Dir ( fileName ) , 0700 ) ; err != nil {
2020-05-05 22:02:16 +00:00
return err
}
2022-10-08 00:36:57 +00:00
return os . WriteFile ( fileName , [ ] byte ( e . name ) , 0600 )
2020-05-05 22:02:16 +00:00
} else if err != nil {
return err
}
e . name = string ( data )
return nil
}
2020-09-24 06:29:25 +00:00
// handler wraps the handler with routes for database info
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) handler ( next http . Handler ) http . Handler {
2022-04-04 21:54:50 +00:00
mux := mux . NewRouter ( ) . SkipClean ( true )
2020-05-05 22:02:16 +00:00
mux . Handle ( "/db/info" , e . infoHandler ( ) )
mux . NotFoundHandler = next
return mux
}
2021-10-12 06:13:10 +00:00
// infoHandler returns etcd cluster information. This is used by new members when joining the cluster.
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) infoHandler ( ) http . Handler {
return http . HandlerFunc ( func ( rw http . ResponseWriter , req * http . Request ) {
ctx , cancel := context . WithTimeout ( req . Context ( ) , 2 * time . Second )
defer cancel ( )
members , err := e . client . MemberList ( ctx )
if err != nil {
json . NewEncoder ( rw ) . Encode ( & Members {
Members : [ ] * etcdserverpb . Member {
{
Name : e . name ,
PeerURLs : [ ] string { e . peerURL ( ) } ,
ClientURLs : [ ] string { e . clientURL ( ) } ,
} ,
} ,
} )
return
}
rw . Header ( ) . Set ( "Content-Type" , "application/json" )
json . NewEncoder ( rw ) . Encode ( & Members {
Members : members . Members ,
} )
} )
}
2022-02-24 22:35:08 +00:00
// GetClient returns an etcd client connected to the specified endpoints.
// If no endpoints are provided, endpoints are retrieved from the provided runtime config.
// If the runtime config does not list any endpoints, the default endpoint is used.
// The returned client should be closed when no longer needed, in order to avoid leaking GRPC
// client goroutines.
2022-04-12 16:59:47 +00:00
func GetClient ( ctx context . Context , control * config . Control , endpoints ... string ) ( * clientv3 . Client , error ) {
cfg , err := getClientConfig ( ctx , control , endpoints ... )
2020-05-05 22:02:16 +00:00
if err != nil {
return nil , err
}
2021-11-10 12:33:42 +00:00
2021-07-02 19:55:47 +00:00
return clientv3 . New ( * cfg )
2020-05-05 22:02:16 +00:00
}
2022-02-24 22:35:08 +00:00
// getClientConfig generates an etcd client config connected to the specified endpoints.
// If no endpoints are provided, getEndpoints is called to provide defaults.
2022-04-12 16:59:47 +00:00
func getClientConfig ( ctx context . Context , control * config . Control , endpoints ... string ) ( * clientv3 . Config , error ) {
runtime := control . Runtime
2022-02-24 22:35:08 +00:00
if len ( endpoints ) == 0 {
2022-04-12 16:59:47 +00:00
endpoints = getEndpoints ( control )
2022-02-24 22:35:08 +00:00
}
2022-03-29 18:45:21 +00:00
config := & clientv3 . Config {
2020-09-22 03:23:18 +00:00
Endpoints : endpoints ,
Context : ctx ,
DialTimeout : defaultDialTimeout ,
DialKeepAliveTime : defaultKeepAliveTime ,
2020-10-27 18:06:26 +00:00
DialKeepAliveTimeout : defaultKeepAliveTimeout ,
2023-02-13 20:00:52 +00:00
AutoSyncInterval : defaultKeepAliveTimeout ,
PermitWithoutStream : true ,
2022-03-29 18:45:21 +00:00
}
var err error
if strings . HasPrefix ( endpoints [ 0 ] , "https://" ) {
config . TLS , err = toTLSConfig ( runtime )
}
return config , err
2020-05-05 22:02:16 +00:00
}
2022-02-24 22:35:08 +00:00
// getEndpoints returns the endpoints from the runtime config if set, otherwise the default endpoint.
2022-04-12 16:59:47 +00:00
func getEndpoints ( control * config . Control ) [ ] string {
runtime := control . Runtime
2022-02-24 22:35:08 +00:00
if len ( runtime . EtcdConfig . Endpoints ) > 0 {
return runtime . EtcdConfig . Endpoints
}
2022-07-21 21:40:09 +00:00
return [ ] string { fmt . Sprintf ( "https://%s:2379" , control . Loopback ( true ) ) }
2022-02-24 22:35:08 +00:00
}
2020-09-22 03:23:18 +00:00
// toTLSConfig converts the ControlRuntime configuration to TLS configuration suitable
// for use by etcd.
2020-05-05 22:02:16 +00:00
func toTLSConfig ( runtime * config . ControlRuntime ) ( * tls . Config , error ) {
2021-10-12 06:13:10 +00:00
if runtime . ClientETCDCert == "" || runtime . ClientETCDKey == "" || runtime . ETCDServerCA == "" {
return nil , errors . New ( "runtime is not ready yet" )
}
2020-05-05 22:02:16 +00:00
clientCert , err := tls . LoadX509KeyPair ( runtime . ClientETCDCert , runtime . ClientETCDKey )
if err != nil {
return nil , err
}
pool , err := certutil . NewPool ( runtime . ETCDServerCA )
if err != nil {
return nil , err
}
return & tls . Config {
RootCAs : pool ,
Certificates : [ ] tls . Certificate { clientCert } ,
} , nil
}
2020-09-22 03:23:18 +00:00
// getAdvertiseAddress returns the IP address best suited for advertising to clients
2023-03-24 22:19:44 +00:00
func getAdvertiseAddress ( advertiseIP string ) ( string , error ) {
2020-05-05 22:02:16 +00:00
ip := advertiseIP
if ip == "" {
ipAddr , err := utilnet . ChooseHostInterface ( )
if err != nil {
return "" , err
}
ip = ipAddr . String ( )
}
return ip , nil
}
2020-09-22 03:23:18 +00:00
// newCluster returns options to set up etcd for a new cluster
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) newCluster ( ctx context . Context , reset bool ) error {
2022-02-24 22:35:08 +00:00
logrus . Infof ( "Starting etcd for new cluster" )
2021-04-26 16:47:53 +00:00
err := e . cluster ( ctx , reset , executor . InitialOptions {
2021-10-12 06:13:10 +00:00
AdvertisePeerURL : e . peerURL ( ) ,
Cluster : fmt . Sprintf ( "%s=%s" , e . name , e . peerURL ( ) ) ,
2020-05-05 22:02:16 +00:00
State : "new" ,
} )
2021-04-26 16:47:53 +00:00
if err != nil {
return err
}
if err := e . migrateFromSQLite ( ctx ) ; err != nil {
return fmt . Errorf ( "failed to migrate content from sqlite to etcd: %w" , err )
}
return nil
}
func ( e * ETCD ) migrateFromSQLite ( ctx context . Context ) error {
_ , err := os . Stat ( sqliteFile ( e . config ) )
if os . IsNotExist ( err ) {
return nil
} else if err != nil {
return err
}
logrus . Infof ( "Migrating content from sqlite to etcd" )
ctx , cancel := context . WithCancel ( ctx )
defer cancel ( )
_ , err = endpoint2 . Listen ( ctx , endpoint2 . Config {
Endpoint : endpoint2 . SQLiteBackend ,
} )
if err != nil {
return err
}
sqliteClient , err := client . New ( endpoint2 . ETCDConfig {
Endpoints : [ ] string { "unix://kine.sock" } ,
} )
if err != nil {
return err
}
defer sqliteClient . Close ( )
2022-04-12 16:59:47 +00:00
etcdClient , err := GetClient ( ctx , e . config )
2021-04-26 16:47:53 +00:00
if err != nil {
return err
}
defer etcdClient . Close ( )
values , err := sqliteClient . List ( ctx , "/registry/" , 0 )
if err != nil {
return err
}
for _ , value := range values {
logrus . Infof ( "Migrating etcd key %s" , value . Key )
_ , err := etcdClient . Put ( ctx , string ( value . Key ) , string ( value . Data ) )
if err != nil {
return err
}
}
return os . Rename ( sqliteFile ( e . config ) , sqliteFile ( e . config ) + ".migrated" )
2020-05-05 22:02:16 +00:00
}
2022-05-05 08:10:08 +00:00
// peerURL returns the external peer access address for the local node.
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) peerURL ( ) string {
2022-04-15 00:31:49 +00:00
return fmt . Sprintf ( "https://%s" , net . JoinHostPort ( e . address , "2380" ) )
2020-05-05 22:02:16 +00:00
}
2022-05-05 08:10:08 +00:00
// listenClientURLs returns a list of URLs to bind to for peer connections.
// During cluster reset/restore, we only listen on loopback to avoid having peers
// connect mid-process.
func ( e * ETCD ) listenPeerURLs ( reset bool ) string {
2022-07-21 21:40:09 +00:00
peerURLs := fmt . Sprintf ( "https://%s:2380" , e . config . Loopback ( true ) )
2022-05-05 08:10:08 +00:00
if ! reset {
peerURLs += "," + e . peerURL ( )
}
return peerURLs
}
// clientURL returns the external client access address for the local node.
2020-05-05 22:02:16 +00:00
func ( e * ETCD ) clientURL ( ) string {
2022-04-15 00:31:49 +00:00
return fmt . Sprintf ( "https://%s" , net . JoinHostPort ( e . address , "2379" ) )
2020-05-05 22:02:16 +00:00
}
2023-03-24 22:19:44 +00:00
// advertiseClientURLs returns the advertised addresses for the local node.
// During cluster reset/restore we only advertise the loopback address to avoid
// having apiservers on other nodes connect mid-process; otherwise the external
// client URL is advertised.
func (e *ETCD) advertiseClientURLs(reset bool) string {
	if reset {
		// Format the loopback host the same way as getEndpoints and the
		// listen*URLs helpers do. net.JoinHostPort wraps any host containing
		// a colon in brackets, so it would double-bracket the host if
		// Loopback(true) already returns a bracketed IPv6 literal.
		// NOTE(review): assumes Loopback(true) yields a URL-safe host, as its
		// other call sites in this file imply — confirm against Loopback.
		return fmt.Sprintf("https://%s:2379", e.config.Loopback(true))
	}
	return e.clientURL()
}
2022-05-05 08:10:08 +00:00
// listenClientURLs returns a list of URLs to bind to for client connections.
2023-03-24 22:19:44 +00:00
// During cluster reset/restore, we only listen on loopback to avoid having apiservers
// on other nodes connect mid-process.
2022-05-05 08:10:08 +00:00
func ( e * ETCD ) listenClientURLs ( reset bool ) string {
2022-07-21 21:40:09 +00:00
clientURLs := fmt . Sprintf ( "https://%s:2379" , e . config . Loopback ( true ) )
2022-05-05 08:10:08 +00:00
if ! reset {
clientURLs += "," + e . clientURL ( )
}
return clientURLs
}
// listenMetricsURLs returns a list of URLs to bind to for metrics connections.
func ( e * ETCD ) listenMetricsURLs ( reset bool ) string {
2022-07-21 21:40:09 +00:00
metricsURLs := fmt . Sprintf ( "http://%s:2381" , e . config . Loopback ( true ) )
2022-05-05 08:10:08 +00:00
if ! reset && e . config . EtcdExposeMetrics {
metricsURLs += "," + fmt . Sprintf ( "http://%s" , net . JoinHostPort ( e . address , "2381" ) )
2021-01-23 01:40:48 +00:00
}
2022-05-05 08:10:08 +00:00
return metricsURLs
2021-01-23 01:40:48 +00:00
}
2022-05-05 08:10:08 +00:00
// cluster calls the executor to start etcd running with the provided configuration.
func ( e * ETCD ) cluster ( ctx context . Context , reset bool , options executor . InitialOptions ) error {
2022-04-27 20:44:15 +00:00
ctx , e . cancel = context . WithCancel ( ctx )
2021-09-08 17:56:18 +00:00
return executor . ETCD ( ctx , executor . ETCDConfig {
2020-05-05 22:02:16 +00:00
Name : e . name ,
InitialOptions : options ,
2022-05-05 08:10:08 +00:00
ForceNewCluster : reset ,
ListenClientURLs : e . listenClientURLs ( reset ) ,
ListenMetricsURLs : e . listenMetricsURLs ( reset ) ,
ListenPeerURLs : e . listenPeerURLs ( reset ) ,
2023-03-24 22:19:44 +00:00
AdvertiseClientURLs : e . advertiseClientURLs ( reset ) ,
2021-11-10 12:33:42 +00:00
DataDir : DBDir ( e . config ) ,
2020-05-05 22:02:16 +00:00
ServerTrust : executor . ServerTrust {
CertFile : e . config . Runtime . ServerETCDCert ,
KeyFile : e . config . Runtime . ServerETCDKey ,
ClientCertAuth : true ,
TrustedCAFile : e . config . Runtime . ETCDServerCA ,
} ,
PeerTrust : executor . PeerTrust {
CertFile : e . config . Runtime . PeerServerClientETCDCert ,
KeyFile : e . config . Runtime . PeerServerClientETCDKey ,
ClientCertAuth : true ,
TrustedCAFile : e . config . Runtime . ETCDPeerCA ,
} ,
2022-05-05 08:10:08 +00:00
SnapshotCount : 10000 ,
2022-04-27 20:44:15 +00:00
ElectionTimeout : 5000 ,
HeartbeatInterval : 500 ,
Logger : "zap" ,
LogOutputs : [ ] string { "stderr" } ,
ExperimentalInitialCorruptCheck : true ,
2021-11-12 05:03:15 +00:00
} , e . config . ExtraEtcdArgs )
2020-05-05 22:02:16 +00:00
}
2022-02-24 22:35:08 +00:00
func ( e * ETCD ) StartEmbeddedTemporary ( ctx context . Context ) error {
etcdDataDir := DBDir ( e . config )
tmpDataDir := etcdDataDir + "-tmp"
os . RemoveAll ( tmpDataDir )
2022-03-25 18:52:40 +00:00
go func ( ) {
<- ctx . Done ( )
2022-02-24 22:35:08 +00:00
if err := os . RemoveAll ( tmpDataDir ) ; err != nil {
logrus . Warnf ( "Failed to remove etcd temp dir: %v" , err )
}
} ( )
if err := cp . Copy ( etcdDataDir , tmpDataDir , cp . Options { PreserveOwner : true } ) ; err != nil {
return err
}
2022-04-12 16:59:47 +00:00
endpoints := getEndpoints ( e . config )
2022-02-24 22:35:08 +00:00
clientURL := endpoints [ 0 ]
peerURL , err := addPort ( endpoints [ 0 ] , 1 )
if err != nil {
return err
}
embedded := executor . Embedded { }
2022-04-27 20:44:15 +00:00
ctx , e . cancel = context . WithCancel ( ctx )
2022-02-24 22:35:08 +00:00
return embedded . ETCD ( ctx , executor . ETCDConfig {
2022-04-27 20:44:15 +00:00
InitialOptions : executor . InitialOptions { AdvertisePeerURL : peerURL } ,
DataDir : tmpDataDir ,
ForceNewCluster : true ,
AdvertiseClientURLs : clientURL ,
ListenClientURLs : clientURL ,
ListenPeerURLs : peerURL ,
Logger : "zap" ,
HeartbeatInterval : 500 ,
ElectionTimeout : 5000 ,
2022-05-05 08:10:08 +00:00
SnapshotCount : 10000 ,
2022-04-27 20:44:15 +00:00
Name : e . name ,
LogOutputs : [ ] string { "stderr" } ,
ExperimentalInitialCorruptCheck : true ,
2022-03-25 18:52:40 +00:00
} , append ( e . config . ExtraAPIArgs , "--max-snapshots=0" , "--max-wals=0" ) )
2022-02-24 22:35:08 +00:00
}
// addPort parses address as a URL and returns "scheme://host:port" with the port
// increased by offset. Any path, query, or user info in address is dropped.
// An error is returned if address cannot be parsed or has no numeric port.
func addPort(address string, offset int) (string, error) {
	u, err := url.Parse(address)
	if err != nil {
		return "", err
	}
	port, err := strconv.Atoi(u.Port())
	if err != nil {
		return "", err
	}
	// u.Hostname() strips the square brackets from IPv6 literals; JoinHostPort
	// restores them so the result remains a valid URL.
	hostPort := net.JoinHostPort(u.Hostname(), strconv.Itoa(port+offset))
	return fmt.Sprintf("%s://%s", u.Scheme, hostPort), nil
}
2021-09-14 15:20:38 +00:00
// RemovePeer removes a peer from the cluster. The peer name and IP address must both match.
func ( e * ETCD ) RemovePeer ( ctx context . Context , name , address string , allowSelfRemoval bool ) error {
2021-08-05 20:32:01 +00:00
ctx , cancel := context . WithTimeout ( ctx , memberRemovalTimeout )
defer cancel ( )
2020-05-05 22:02:16 +00:00
members , err := e . client . MemberList ( ctx )
if err != nil {
return err
}
for _ , member := range members . Members {
2021-09-14 15:20:38 +00:00
if member . Name != name {
2020-05-05 22:02:16 +00:00
continue
}
for _ , peerURL := range member . PeerURLs {
u , err := url . Parse ( peerURL )
if err != nil {
return err
}
if u . Hostname ( ) == address {
2021-09-14 15:20:38 +00:00
if e . address == address && ! allowSelfRemoval {
return errors . New ( "not removing self from etcd cluster" )
2020-10-28 16:32:51 +00:00
}
2020-05-05 22:02:16 +00:00
logrus . Infof ( "Removing name=%s id=%d address=%s from etcd" , member . Name , member . ID , address )
_ , err := e . client . MemberRemove ( ctx , member . ID )
2023-04-05 00:52:14 +00:00
if errors . Is ( err , rpctypes . ErrGRPCMemberNotFound ) {
2020-10-28 16:32:51 +00:00
return nil
}
2020-05-05 22:02:16 +00:00
return err
}
}
}
return nil
}
2020-07-29 20:52:49 +00:00
2020-10-27 18:06:26 +00:00
// manageLearners monitors the etcd cluster to ensure that learners are making progress towards
// being promoted to full voting member. The checks only run on the cluster member that is
// the etcd leader.
2022-02-23 21:52:46 +00:00
func ( e * ETCD ) manageLearners ( ctx context . Context ) {
2022-02-24 19:01:14 +00:00
<- e . config . Runtime . AgentReady
2020-10-27 18:06:26 +00:00
t := time . NewTicker ( manageTickerTime )
2020-07-29 20:52:49 +00:00
defer t . Stop ( )
2020-10-27 18:06:26 +00:00
2020-07-29 20:52:49 +00:00
for range t . C {
2022-07-20 00:21:23 +00:00
ctx , cancel := context . WithTimeout ( ctx , manageTickerTime )
2020-10-27 18:06:26 +00:00
defer cancel ( )
// Check to see if the local node is the leader. Only the leader should do learner management.
2021-07-26 16:59:33 +00:00
if e . client == nil {
2022-02-23 21:52:46 +00:00
logrus . Debug ( "Etcd client was nil" )
2021-07-26 16:59:33 +00:00
continue
}
2022-04-12 16:59:47 +00:00
endpoints := getEndpoints ( e . config )
2022-02-24 22:35:08 +00:00
if status , err := e . client . Status ( ctx , endpoints [ 0 ] ) ; err != nil {
2020-10-27 18:06:26 +00:00
logrus . Errorf ( "Failed to check local etcd status for learner management: %v" , err )
continue
} else if status . Header . MemberId != status . Leader {
continue
}
progress , err := e . getLearnerProgress ( ctx )
2020-07-29 20:52:49 +00:00
if err != nil {
2020-10-27 18:06:26 +00:00
logrus . Errorf ( "Failed to get recorded learner progress from etcd: %v" , err )
2020-07-29 20:52:49 +00:00
continue
}
2020-10-27 18:06:26 +00:00
members , err := e . client . MemberList ( ctx )
2020-07-29 20:52:49 +00:00
if err != nil {
2020-10-27 18:06:26 +00:00
logrus . Errorf ( "Failed to get etcd members for learner management: %v" , err )
2020-07-29 20:52:49 +00:00
continue
}
2020-10-27 18:06:26 +00:00
2020-07-29 20:52:49 +00:00
for _ , member := range members . Members {
2020-10-27 18:06:26 +00:00
if member . IsLearner {
if err := e . trackLearnerProgress ( ctx , progress , member ) ; err != nil {
logrus . Errorf ( "Failed to track learner progress towards promotion: %v" , err )
}
2020-07-29 20:52:49 +00:00
break
}
}
}
2022-02-23 21:52:46 +00:00
return
2020-07-29 20:52:49 +00:00
}
2020-10-27 18:06:26 +00:00
// trackLearnerProgress attempts to promote a learner. If it cannot be promoted, progress through the raft index is tracked.
// If the learner does not make any progress in a reasonable amount of time, it is evicted from the cluster.
// The progress struct is updated in place and, unless the learner was promoted or
// evicted, written back to etcd via setLearnerProgress.
func (e *ETCD) trackLearnerProgress(ctx context.Context, progress *learnerProgress, member *etcdserverpb.Member) error {
	// Try to promote it. If it can be promoted, no further tracking is necessary
	if _, err := e.client.MemberPromote(ctx, member.ID); err != nil {
		logrus.Debugf("Unable to promote learner %s: %v", member.Name, err)
	} else {
		logrus.Infof("Promoted learner %s", member.Name)
		return nil
	}

	now := time.Now()

	// If this is the first time we've tracked this member's progress, reset stats
	if progress.Name != member.Name || progress.ID != member.ID {
		progress.ID = member.ID
		progress.Name = member.Name
		progress.RaftAppliedIndex = 0
		progress.LastProgress.Time = now
	}

	// Update progress by retrieving status from the member's first reachable client URL
	for _, ep := range member.ClientURLs {
		// Release each per-endpoint timeout as soon as the call returns instead of
		// deferring it to function exit.
		statusCtx, cancel := context.WithTimeout(ctx, defaultDialTimeout)
		status, err := e.client.Status(statusCtx, ep)
		cancel()
		if err != nil {
			logrus.Debugf("Failed to get etcd status from learner %s at %s: %v", member.Name, ep, err)
			continue
		}

		if progress.RaftAppliedIndex < status.RaftAppliedIndex {
			logrus.Debugf("Learner %s has progressed from RaftAppliedIndex %d to %d", progress.Name, progress.RaftAppliedIndex, status.RaftAppliedIndex)
			progress.RaftAppliedIndex = status.RaftAppliedIndex
			progress.LastProgress.Time = now
		}
		break
	}

	// Warn if the learner hasn't made any progress
	if !progress.LastProgress.Time.Equal(now) {
		logrus.Warnf("Learner %s stalled at RaftAppliedIndex=%d for %s", progress.Name, progress.RaftAppliedIndex, now.Sub(progress.LastProgress.Time).String())
	}

	// See if it's time to evict yet
	if now.Sub(progress.LastProgress.Time) > learnerMaxStallTime {
		if _, err := e.client.MemberRemove(ctx, member.ID); err != nil {
			return err
		}
		logrus.Warnf("Removed learner %s from etcd cluster", member.Name)
		return nil
	}

	return e.setLearnerProgress(ctx, progress)
}
// getLearnerProgress returns the stored learnerProgress struct as retrieved from etcd.
// If nothing is stored under learnerProgressKey, an empty struct is returned.
func (e *ETCD) getLearnerProgress(ctx context.Context) (*learnerProgress, error) {
	resp, err := e.client.Get(ctx, learnerProgressKey)
	if err != nil {
		return nil, err
	}

	progress := new(learnerProgress)
	if resp.Count >= 1 {
		decoder := json.NewDecoder(bytes.NewBuffer(resp.Kvs[0].Value))
		if err := decoder.Decode(progress); err != nil {
			return nil, err
		}
	}
	return progress, nil
}
// setLearnerProgress stores the learnerProgress struct to etcd, JSON-encoded,
// under learnerProgressKey.
func (e *ETCD) setLearnerProgress(ctx context.Context, status *learnerProgress) error {
	var encoded bytes.Buffer
	if err := json.NewEncoder(&encoded).Encode(status); err != nil {
		return err
	}
	if _, err := e.client.Put(ctx, learnerProgressKey, encoded.String()); err != nil {
		return err
	}
	return nil
}
2022-02-23 21:52:46 +00:00
// clearAlarms checks for any alarms on the local etcd member. If found, they are
// reported and the alarm state is cleared.
func ( e * ETCD ) clearAlarms ( ctx context . Context ) error {
ctx , cancel := context . WithTimeout ( ctx , testTimeout )
defer cancel ( )
if e . client == nil {
return errors . New ( "etcd client was nil" )
}
alarmList , err := e . client . AlarmList ( ctx )
if err != nil {
return fmt . Errorf ( "etcd alarm list failed: %v" , err )
}
for _ , alarm := range alarmList . Alarms {
2022-03-25 18:52:40 +00:00
logrus . Warnf ( "Alarm on etcd member %d: %s" , alarm . MemberID , alarm . Alarm )
2022-02-23 21:52:46 +00:00
}
2022-03-25 18:52:40 +00:00
if len ( alarmList . Alarms ) > 0 {
2022-02-23 21:52:46 +00:00
if _ , err := e . client . AlarmDisarm ( ctx , & clientv3 . AlarmMember { } ) ; err != nil {
return fmt . Errorf ( "etcd alarm disarm failed: %v" , err )
}
logrus . Infof ( "Alarms disarmed on etcd server" )
}
return nil
}
2022-03-25 18:52:40 +00:00
func ( e * ETCD ) defragment ( ctx context . Context ) error {
ctx , cancel := context . WithTimeout ( ctx , testTimeout )
defer cancel ( )
if e . client == nil {
return errors . New ( "etcd client was nil" )
}
logrus . Infof ( "Defragmenting etcd database" )
2022-04-12 16:59:47 +00:00
endpoints := getEndpoints ( e . config )
2022-03-25 18:52:40 +00:00
_ , err := e . client . Defragment ( ctx , endpoints [ 0 ] )
return err
}
2022-02-24 22:35:08 +00:00
// clientURLs returns a list of all non-learner etcd cluster member client access URLs.
// The list is retrieved from the remote server that is being joined.
2021-03-01 21:50:50 +00:00
func ClientURLs ( ctx context . Context , clientAccessInfo * clientaccess . Info , selfIP string ) ( [ ] string , Members , error ) {
2020-07-29 20:52:49 +00:00
var memberList Members
2021-03-06 10:29:57 +00:00
resp , err := clientAccessInfo . Get ( "/db/info" )
2020-07-29 20:52:49 +00:00
if err != nil {
return nil , memberList , err
}
if err := json . Unmarshal ( resp , & memberList ) ; err != nil {
return nil , memberList , err
}
2023-03-24 22:19:44 +00:00
ip , err := getAdvertiseAddress ( selfIP )
2021-03-01 21:50:50 +00:00
if err != nil {
return nil , memberList , err
}
2020-07-29 20:52:49 +00:00
var clientURLs [ ] string
2021-03-01 21:50:50 +00:00
members :
2020-07-29 20:52:49 +00:00
for _ , member := range memberList . Members {
// excluding learner member from the client list
if member . IsLearner {
continue
}
2021-08-12 22:59:04 +00:00
for _ , clientURL := range member . ClientURLs {
u , err := url . Parse ( clientURL )
if err != nil {
continue
}
if u . Hostname ( ) == ip {
2021-03-01 21:50:50 +00:00
continue members
}
}
2020-07-29 20:52:49 +00:00
clientURLs = append ( clientURLs , member . ClientURLs ... )
}
return clientURLs , memberList , nil
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
2020-09-22 03:23:18 +00:00
// snapshotDir ensures that the snapshot directory exists, and then returns its path.
2021-08-09 16:04:18 +00:00
func snapshotDir ( config * config . Control , create bool ) ( string , error ) {
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
if config . EtcdSnapshotDir == "" {
// we have to create the snapshot dir if we are using
// the default snapshot dir if it doesn't exist
defaultSnapshotDir := filepath . Join ( config . DataDir , "db" , "snapshots" )
s , err := os . Stat ( defaultSnapshotDir )
if err != nil {
2021-08-09 16:04:18 +00:00
if create && os . IsNotExist ( err ) {
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
if err := os . MkdirAll ( defaultSnapshotDir , 0700 ) ; err != nil {
return "" , err
}
return defaultSnapshotDir , nil
}
return "" , err
}
if s . IsDir ( ) {
return defaultSnapshotDir , nil
}
}
return config . EtcdSnapshotDir , nil
}
2021-01-21 21:09:15 +00:00
// preSnapshotSetup checks to see if the necessary components are in place
// to perform an Etcd snapshot. This is necessary primarily for on-demand
// snapshots since they're performed before normal Etcd setup is completed.
func ( e * ETCD ) preSnapshotSetup ( ctx context . Context , config * config . Control ) error {
2022-07-09 01:27:05 +00:00
if e . snapshotSem == nil {
e . snapshotSem = semaphore . NewWeighted ( maxConcurrentSnapshots )
}
2021-01-21 21:09:15 +00:00
if e . client == nil {
if e . config == nil {
e . config = config
}
2022-04-12 16:59:47 +00:00
client , err := GetClient ( ctx , e . config )
2021-01-21 21:09:15 +00:00
if err != nil {
return err
}
e . client = client
2022-03-10 22:03:02 +00:00
go func ( ) {
<- ctx . Done ( )
e . client . Close ( )
} ( )
2021-01-21 21:09:15 +00:00
}
return nil
}
2022-01-14 17:31:22 +00:00
// compressSnapshot compresses the given snapshot and provides the
// caller with the path to the file.
//
// The snapshot is read from snapshotDir/snapshotName and written as a single
// deflate-compressed entry named snapshotName into
// snapshotDir/<snapshotName><compressedExtension>. The snapshotPath parameter is
// not used. On any failure, including a failure to finalize the archive, the
// partially written zip file is removed and an empty path is returned.
func (e *ETCD) compressSnapshot(snapshotDir, snapshotName, snapshotPath string) (string, error) {
	logrus.Info("Compressing etcd snapshot file: " + snapshotName)

	zipPath := filepath.Join(snapshotDir, snapshotName+compressedExtension)
	zf, err := os.Create(zipPath)
	if err != nil {
		return "", err
	}

	// writeArchive streams the snapshot into the zip file and finalizes the archive.
	writeArchive := func() error {
		fileToZip, err := os.Open(filepath.Join(snapshotDir, snapshotName))
		if err != nil {
			return err
		}
		defer fileToZip.Close()

		info, err := fileToZip.Stat()
		if err != nil {
			return err
		}

		header, err := zip.FileInfoHeader(info)
		if err != nil {
			return err
		}
		header.Name = snapshotName
		header.Method = zip.Deflate
		header.Modified = time.Now()

		zipWriter := zip.NewWriter(zf)
		writer, err := zipWriter.CreateHeader(header)
		if err != nil {
			return err
		}
		if _, err := io.Copy(writer, fileToZip); err != nil {
			return err
		}
		// Close writes the central directory; without it the archive is invalid,
		// so its error must not be dropped.
		return zipWriter.Close()
	}

	err = writeArchive()
	if closeErr := zf.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		// Best-effort cleanup of the incomplete archive.
		os.Remove(zipPath)
		return "", err
	}
	return zipPath, nil
}
// decompressSnapshot decompresses the given snapshot and provides the caller
// with the full path to the uncompressed snapshot.
//
// Each regular entry of the zip archive at snapshotFile is extracted into
// snapshotDir, using only the base name of the entry (with compressedExtension
// removed) so that an entry name cannot place files outside snapshotDir. The path of
// the last extracted file is returned. A partially written file is removed when
// extraction fails, and an error is returned if the archive contains no files.
func (e *ETCD) decompressSnapshot(snapshotDir, snapshotFile string) (string, error) {
	logrus.Info("Decompressing etcd snapshot file: " + snapshotFile)

	r, err := zip.OpenReader(snapshotFile)
	if err != nil {
		return "", err
	}
	defer r.Close()

	// extract writes a single archive entry to target, closing both ends before
	// returning so that handles do not accumulate across entries.
	extract := func(sf *zip.File, target string) error {
		src, err := sf.Open()
		if err != nil {
			return err
		}
		defer src.Close()

		dst, err := os.OpenFile(target, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, sf.Mode())
		if err != nil {
			return err
		}
		_, err = io.Copy(dst, src)
		if closeErr := dst.Close(); err == nil {
			err = closeErr
		}
		if err != nil {
			// Best-effort cleanup of the incomplete file.
			os.Remove(target)
		}
		return err
	}

	var decompressedPath string
	for _, sf := range r.File {
		if sf.FileInfo().IsDir() {
			continue
		}
		name := filepath.Base(strings.Replace(sf.Name, compressedExtension, "", -1))
		target := filepath.Join(snapshotDir, name)
		if err := extract(sf, target); err != nil {
			return "", err
		}
		decompressedPath = target
	}

	if decompressedPath == "" {
		return "", fmt.Errorf("no files found in compressed snapshot %s", snapshotFile)
	}
	return decompressedPath, nil
}
2021-11-29 18:30:04 +00:00
// Snapshot attempts to save a new snapshot to the configured directory, and then clean up any old and failed
2021-01-21 21:09:15 +00:00
// snapshots in excess of the retention limits. This method is used in the internal cron snapshot
// system as well as used to do on-demand snapshots.
func ( e * ETCD ) Snapshot ( ctx context . Context , config * config . Control ) error {
if err := e . preSnapshotSetup ( ctx , config ) ; err != nil {
return err
}
2022-07-09 01:27:05 +00:00
if ! e . snapshotSem . TryAcquire ( maxConcurrentSnapshots ) {
return fmt . Errorf ( "%d snapshots already in progress" , maxConcurrentSnapshots )
}
defer e . snapshotSem . Release ( maxConcurrentSnapshots )
2021-01-21 21:09:15 +00:00
2022-04-04 22:52:47 +00:00
// make sure the core.Factory is initialized before attempting to add snapshot metadata
2021-11-29 18:30:04 +00:00
var extraMetadata string
2022-04-04 22:52:47 +00:00
if e . config . Runtime . Core == nil {
logrus . Debugf ( "Cannot retrieve extra metadata from %s ConfigMap: runtime core not ready" , snapshotExtraMetadataConfigMapName )
2021-11-29 18:30:04 +00:00
} else {
2022-04-04 22:52:47 +00:00
logrus . Debugf ( "Attempting to retrieve extra metadata from %s ConfigMap" , snapshotExtraMetadataConfigMapName )
if snapshotExtraMetadataConfigMap , err := e . config . Runtime . Core . Core ( ) . V1 ( ) . ConfigMap ( ) . Get ( metav1 . NamespaceSystem , snapshotExtraMetadataConfigMapName , metav1 . GetOptions { } ) ; err != nil {
logrus . Debugf ( "Error encountered attempting to retrieve extra metadata from %s ConfigMap, error: %v" , snapshotExtraMetadataConfigMapName , err )
2021-11-29 18:30:04 +00:00
} else {
2022-04-04 22:52:47 +00:00
if m , err := json . Marshal ( snapshotExtraMetadataConfigMap . Data ) ; err != nil {
logrus . Debugf ( "Error attempting to marshal extra metadata contained in %s ConfigMap, error: %v" , snapshotExtraMetadataConfigMapName , err )
} else {
logrus . Debugf ( "Setting extra metadata from %s ConfigMap" , snapshotExtraMetadataConfigMapName )
logrus . Tracef ( "Marshalled extra metadata in %s ConfigMap was: %s" , snapshotExtraMetadataConfigMapName , string ( m ) )
extraMetadata = base64 . StdEncoding . EncodeToString ( m )
}
2021-11-29 18:30:04 +00:00
}
}
2022-04-12 16:59:47 +00:00
endpoints := getEndpoints ( e . config )
2022-02-24 22:35:08 +00:00
status , err := e . client . Status ( ctx , endpoints [ 0 ] )
2020-09-22 03:23:18 +00:00
if err != nil {
2021-01-21 21:09:15 +00:00
return errors . Wrap ( err , "failed to check etcd status for snapshot" )
2020-09-22 03:23:18 +00:00
}
if status . IsLearner {
2021-11-29 18:30:04 +00:00
logrus . Warnf ( "Unable to take snapshot: not supported for learner" )
2021-01-21 21:09:15 +00:00
return nil
2020-09-22 03:23:18 +00:00
}
2021-08-09 16:04:18 +00:00
snapshotDir , err := snapshotDir ( e . config , true )
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
if err != nil {
2021-01-21 21:09:15 +00:00
return errors . Wrap ( err , "failed to get the snapshot dir" )
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
}
2020-09-22 03:23:18 +00:00
2022-04-12 16:59:47 +00:00
cfg , err := getClientConfig ( ctx , e . config )
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
if err != nil {
2021-01-21 21:09:15 +00:00
return errors . Wrap ( err , "failed to get config for etcd snapshot" )
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
}
2021-05-01 01:26:39 +00:00
nodeName := os . Getenv ( "NODE_NAME" )
2021-11-29 18:30:04 +00:00
now := time . Now ( )
snapshotName := fmt . Sprintf ( "%s-%s-%d" , e . config . EtcdSnapshotName , nodeName , now . Unix ( ) )
2021-01-21 21:09:15 +00:00
snapshotPath := filepath . Join ( snapshotDir , snapshotName )
2020-09-22 03:23:18 +00:00
logrus . Infof ( "Saving etcd snapshot to %s" , snapshotPath )
2022-01-14 17:31:22 +00:00
var sf * snapshotFile
2021-11-29 18:30:04 +00:00
2022-04-13 23:22:07 +00:00
lg , err := logutil . CreateDefaultZapLogger ( zap . InfoLevel )
if err != nil {
return err
}
if err := snapshot . NewV3 ( lg ) . Save ( ctx , * cfg , snapshotPath ) ; err != nil {
2022-01-14 17:31:22 +00:00
sf = & snapshotFile {
2021-11-29 18:30:04 +00:00
Name : snapshotName ,
Location : "" ,
Metadata : extraMetadata ,
NodeName : nodeName ,
CreatedAt : & metav1 . Time {
Time : now ,
} ,
2022-01-14 17:31:22 +00:00
Status : failedSnapshotStatus ,
Message : base64 . StdEncoding . EncodeToString ( [ ] byte ( err . Error ( ) ) ) ,
Size : 0 ,
Compressed : e . config . EtcdSnapshotCompress ,
2021-11-29 18:30:04 +00:00
}
logrus . Errorf ( "Failed to take etcd snapshot: %v" , err )
if err := e . addSnapshotData ( * sf ) ; err != nil {
return errors . Wrap ( err , "failed to save local snapshot failure data to configmap" )
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
}
2021-01-21 21:09:15 +00:00
2022-01-14 17:31:22 +00:00
if e . config . EtcdSnapshotCompress {
zipPath , err := e . compressSnapshot ( snapshotDir , snapshotName , snapshotPath )
if err != nil {
return err
}
if err := os . Remove ( snapshotPath ) ; err != nil {
return err
}
snapshotPath = zipPath
logrus . Info ( "Compressed snapshot: " + snapshotPath )
}
2021-11-29 18:30:04 +00:00
// If the snapshot attempt was successful, sf will be nil as we did not set it.
if sf == nil {
f , err := os . Stat ( snapshotPath )
if err != nil {
return errors . Wrap ( err , "unable to retrieve snapshot information from local snapshot" )
2021-03-03 18:14:12 +00:00
}
2022-01-14 17:31:22 +00:00
sf = & snapshotFile {
2021-11-29 18:30:04 +00:00
Name : f . Name ( ) ,
Metadata : extraMetadata ,
Location : "file://" + snapshotPath ,
NodeName : nodeName ,
CreatedAt : & metav1 . Time {
Time : f . ModTime ( ) ,
} ,
2022-01-14 17:31:22 +00:00
Status : successfulSnapshotStatus ,
Size : f . Size ( ) ,
Compressed : e . config . EtcdSnapshotCompress ,
2021-03-03 18:14:12 +00:00
}
2021-11-29 18:30:04 +00:00
if err := e . addSnapshotData ( * sf ) ; err != nil {
return errors . Wrap ( err , "failed to save local snapshot data to configmap" )
2021-03-03 18:14:12 +00:00
}
2021-07-19 21:30:57 +00:00
if err := snapshotRetention ( e . config . EtcdSnapshotRetention , e . config . EtcdSnapshotName , snapshotDir ) ; err != nil {
2021-11-29 18:30:04 +00:00
return errors . Wrap ( err , "failed to apply local snapshot retention policy" )
}
if e . config . EtcdS3 {
logrus . Infof ( "Saving etcd snapshot %s to S3" , snapshotName )
// Set sf to nil so that we can attempt to now upload the snapshot to S3 if needed
sf = nil
if err := e . initS3IfNil ( ctx ) ; err != nil {
logrus . Warnf ( "Unable to initialize S3 client: %v" , err )
2022-01-14 17:31:22 +00:00
sf = & snapshotFile {
2021-11-29 18:30:04 +00:00
Name : filepath . Base ( snapshotPath ) ,
Metadata : extraMetadata ,
NodeName : "s3" ,
CreatedAt : & metav1 . Time {
Time : now ,
} ,
Message : base64 . StdEncoding . EncodeToString ( [ ] byte ( err . Error ( ) ) ) ,
Size : 0 ,
2022-01-14 17:31:22 +00:00
Status : failedSnapshotStatus ,
2021-11-29 18:30:04 +00:00
S3 : & s3Config {
Endpoint : e . config . EtcdS3Endpoint ,
EndpointCA : e . config . EtcdS3EndpointCA ,
SkipSSLVerify : e . config . EtcdS3SkipSSLVerify ,
Bucket : e . config . EtcdS3BucketName ,
Region : e . config . EtcdS3Region ,
Folder : e . config . EtcdS3Folder ,
Insecure : e . config . EtcdS3Insecure ,
} ,
}
}
// sf should be nil if we were able to successfully initialize the S3 client.
if sf == nil {
sf , err = e . s3 . upload ( ctx , snapshotPath , extraMetadata , now )
if err != nil {
return err
}
logrus . Infof ( "S3 upload complete for %s" , snapshotName )
if err := e . s3 . snapshotRetention ( ctx ) ; err != nil {
return errors . Wrap ( err , "failed to apply s3 snapshot retention policy" )
}
}
if err := e . addSnapshotData ( * sf ) ; err != nil {
return errors . Wrap ( err , "failed to save snapshot data to configmap" )
}
2021-01-21 21:09:15 +00:00
}
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
}
2021-01-21 21:09:15 +00:00
2021-11-29 18:30:04 +00:00
return e . ReconcileSnapshotData ( ctx )
2021-05-01 01:26:39 +00:00
}
type s3Config struct {
Endpoint string ` json:"endpoint,omitempty" `
EndpointCA string ` json:"endpointCA,omitempty" `
SkipSSLVerify bool ` json:"skipSSLVerify,omitempty" `
Bucket string ` json:"bucket,omitempty" `
Region string ` json:"region,omitempty" `
Folder string ` json:"folder,omitempty" `
2021-09-05 15:56:15 +00:00
Insecure bool ` json:"insecure,omitempty" `
2021-05-01 01:26:39 +00:00
}
2022-01-14 17:31:22 +00:00
// snapshotStatus is the recorded outcome of a snapshot attempt.
type snapshotStatus string
2021-11-29 18:30:04 +00:00
2022-01-14 17:31:22 +00:00
// Values a snapshotStatus can take.
const (
	// successfulSnapshotStatus marks a snapshot that completed.
	successfulSnapshotStatus = snapshotStatus("successful")
	// failedSnapshotStatus marks a snapshot attempt that failed.
	failedSnapshotStatus = snapshotStatus("failed")
)
2021-11-29 18:30:04 +00:00
2022-01-14 17:31:22 +00:00
// snapshotFile represents a single snapshot and it's
2021-05-01 01:26:39 +00:00
// metadata.
2022-01-14 17:31:22 +00:00
type snapshotFile struct {
2021-05-01 01:26:39 +00:00
Name string ` json:"name" `
// Location contains the full path of the snapshot. For
// local paths, the location will be prefixed with "file://".
2022-01-14 17:31:22 +00:00
Location string ` json:"location,omitempty" `
Metadata string ` json:"metadata,omitempty" `
Message string ` json:"message,omitempty" `
NodeName string ` json:"nodeName,omitempty" `
CreatedAt * metav1 . Time ` json:"createdAt,omitempty" `
Size int64 ` json:"size,omitempty" `
Status snapshotStatus ` json:"status,omitempty" `
S3 * s3Config ` json:"s3Config,omitempty" `
Compressed bool ` json:"compressed" `
2021-11-29 18:30:04 +00:00
}
// listLocalSnapshots provides a list of the currently stored
// snapshots on disk along with their relevant
// metadata.
2022-01-14 17:31:22 +00:00
func ( e * ETCD ) listLocalSnapshots ( ) ( map [ string ] snapshotFile , error ) {
snapshots := make ( map [ string ] snapshotFile )
2021-11-29 18:30:04 +00:00
snapshotDir , err := snapshotDir ( e . config , true )
if err != nil {
return snapshots , errors . Wrap ( err , "failed to get the snapshot dir" )
}
2022-10-08 00:36:57 +00:00
dirEntries , err := os . ReadDir ( snapshotDir )
2021-11-29 18:30:04 +00:00
if err != nil {
return nil , err
}
nodeName := os . Getenv ( "NODE_NAME" )
2022-10-08 00:36:57 +00:00
for _ , de := range dirEntries {
file , err := de . Info ( )
if err != nil {
return nil , err
}
2022-01-14 17:31:22 +00:00
sf := snapshotFile {
2022-10-08 00:36:57 +00:00
Name : file . Name ( ) ,
Location : "file://" + filepath . Join ( snapshotDir , file . Name ( ) ) ,
2021-11-29 18:30:04 +00:00
NodeName : nodeName ,
CreatedAt : & metav1 . Time {
2022-10-08 00:36:57 +00:00
Time : file . ModTime ( ) ,
2021-11-29 18:30:04 +00:00
} ,
2022-10-08 00:36:57 +00:00
Size : file . Size ( ) ,
2022-01-14 17:31:22 +00:00
Status : successfulSnapshotStatus ,
2021-11-29 18:30:04 +00:00
}
sfKey := generateSnapshotConfigMapKey ( sf )
snapshots [ sfKey ] = sf
}
return snapshots , nil
2021-05-01 01:26:39 +00:00
}
2021-11-29 18:30:04 +00:00
// listS3Snapshots provides a list of currently stored
// snapshots in S3 along with their relevant
2021-05-01 01:26:39 +00:00
// metadata.
2022-01-14 17:31:22 +00:00
func ( e * ETCD ) listS3Snapshots ( ctx context . Context ) ( map [ string ] snapshotFile , error ) {
snapshots := make ( map [ string ] snapshotFile )
2021-05-01 01:26:39 +00:00
if e . config . EtcdS3 {
ctx , cancel := context . WithCancel ( ctx )
defer cancel ( )
2021-05-07 23:10:04 +00:00
if err := e . initS3IfNil ( ctx ) ; err != nil {
return nil , err
2021-05-01 01:26:39 +00:00
}
2021-08-09 23:14:41 +00:00
var loo minio . ListObjectsOptions
if e . config . EtcdS3Folder != "" {
loo = minio . ListObjectsOptions {
Prefix : e . config . EtcdS3Folder ,
Recursive : true ,
}
}
objects := e . s3 . client . ListObjects ( ctx , e . config . EtcdS3BucketName , loo )
2021-05-01 01:26:39 +00:00
for obj := range objects {
if obj . Err != nil {
return nil , obj . Err
}
if obj . Size == 0 {
continue
}
ca , err := time . Parse ( time . RFC3339 , obj . LastModified . Format ( time . RFC3339 ) )
if err != nil {
return nil , err
}
2022-01-14 17:31:22 +00:00
sf := snapshotFile {
2021-05-01 01:26:39 +00:00
Name : filepath . Base ( obj . Key ) ,
NodeName : "s3" ,
CreatedAt : & metav1 . Time {
Time : ca ,
} ,
Size : obj . Size ,
S3 : & s3Config {
Endpoint : e . config . EtcdS3Endpoint ,
EndpointCA : e . config . EtcdS3EndpointCA ,
SkipSSLVerify : e . config . EtcdS3SkipSSLVerify ,
Bucket : e . config . EtcdS3BucketName ,
Region : e . config . EtcdS3Region ,
Folder : e . config . EtcdS3Folder ,
2021-09-05 15:56:15 +00:00
Insecure : e . config . EtcdS3Insecure ,
2021-05-01 01:26:39 +00:00
} ,
2022-01-14 17:31:22 +00:00
Status : successfulSnapshotStatus ,
2021-11-29 18:30:04 +00:00
}
sfKey := generateSnapshotConfigMapKey ( sf )
snapshots [ sfKey ] = sf
2021-05-01 01:26:39 +00:00
}
}
return snapshots , nil
}
2021-05-07 23:10:04 +00:00
// initS3IfNil initializes the S3 client
// if it hasn't yet been initialized.
func ( e * ETCD ) initS3IfNil ( ctx context . Context ) error {
if e . s3 == nil {
2021-06-30 20:29:03 +00:00
s3 , err := NewS3 ( ctx , e . config )
2021-05-07 23:10:04 +00:00
if err != nil {
return err
}
e . s3 = s3
}
return nil
}
2021-11-29 18:30:04 +00:00
// PruneSnapshots performs a retention run with the given
2021-05-13 20:36:33 +00:00
// retention duration and removes expired snapshots.
func ( e * ETCD ) PruneSnapshots ( ctx context . Context ) error {
2021-08-09 16:04:18 +00:00
snapshotDir , err := snapshotDir ( e . config , false )
2021-05-13 20:36:33 +00:00
if err != nil {
return errors . Wrap ( err , "failed to get the snapshot dir" )
}
2021-11-29 18:30:04 +00:00
if err := snapshotRetention ( e . config . EtcdSnapshotRetention , e . config . EtcdSnapshotName , snapshotDir ) ; err != nil {
logrus . Errorf ( "Error applying snapshot retention policy: %v" , err )
}
2021-05-13 20:36:33 +00:00
2021-05-18 20:57:40 +00:00
if e . config . EtcdS3 {
2021-11-29 18:30:04 +00:00
if err := e . initS3IfNil ( ctx ) ; err != nil {
logrus . Warnf ( "Unable to initialize S3 client during prune: %v" , err )
} else {
if err := e . s3 . snapshotRetention ( ctx ) ; err != nil {
logrus . Errorf ( "Error applying S3 snapshot retention policy: %v" , err )
}
2021-05-18 20:57:40 +00:00
}
}
2021-11-29 18:30:04 +00:00
return e . ReconcileSnapshotData ( ctx )
2021-05-13 20:36:33 +00:00
}
2021-05-11 23:59:33 +00:00
// ListSnapshots is an exported wrapper method that wraps an
// unexported method of the same name.
2022-01-14 17:31:22 +00:00
func ( e * ETCD ) ListSnapshots ( ctx context . Context ) ( map [ string ] snapshotFile , error ) {
2021-11-29 18:30:04 +00:00
if e . config . EtcdS3 {
return e . listS3Snapshots ( ctx )
2021-05-11 23:59:33 +00:00
}
2021-11-29 18:30:04 +00:00
return e . listLocalSnapshots ( )
2021-05-11 23:59:33 +00:00
}
2021-05-07 23:10:04 +00:00
// deleteSnapshots removes the given snapshots from
// either local storage or S3.
func ( e * ETCD ) DeleteSnapshots ( ctx context . Context , snapshots [ ] string ) error {
2021-08-09 16:04:18 +00:00
snapshotDir , err := snapshotDir ( e . config , false )
2021-05-07 23:10:04 +00:00
if err != nil {
return errors . Wrap ( err , "failed to get the snapshot dir" )
}
if e . config . EtcdS3 {
logrus . Info ( "Removing the given etcd snapshot(s) from S3" )
logrus . Debugf ( "Removing the given etcd snapshot(s) from S3: %v" , snapshots )
if e . initS3IfNil ( ctx ) ; err != nil {
return err
}
objectsCh := make ( chan minio . ObjectInfo )
2021-10-15 17:24:14 +00:00
ctx , cancel := context . WithTimeout ( ctx , e . config . EtcdS3Timeout )
2021-05-07 23:10:04 +00:00
defer cancel ( )
go func ( ) {
2021-05-19 02:58:30 +00:00
defer close ( objectsCh )
2021-05-07 23:10:04 +00:00
opts := minio . ListObjectsOptions {
Recursive : true ,
}
for obj := range e . s3 . client . ListObjects ( ctx , e . config . EtcdS3BucketName , opts ) {
if obj . Err != nil {
logrus . Error ( obj . Err )
return
}
// iterate through the given snapshots and only
// add them to the channel for remove if they're
// actually found from the bucket listing.
for _ , snapshot := range snapshots {
if snapshot == obj . Key {
objectsCh <- obj
}
}
}
} ( )
for {
select {
case <- ctx . Done ( ) :
logrus . Errorf ( "Unable to delete snapshot: %v" , ctx . Err ( ) )
2021-11-29 18:30:04 +00:00
return e . ReconcileSnapshotData ( ctx )
2021-05-07 23:10:04 +00:00
case <- time . After ( time . Millisecond * 100 ) :
continue
case err , ok := <- e . s3 . client . RemoveObjects ( ctx , e . config . EtcdS3BucketName , objectsCh , minio . RemoveObjectsOptions { } ) :
if err . Err != nil {
logrus . Errorf ( "Unable to delete snapshot: %v" , err . Err )
}
if ! ok {
2021-11-29 18:30:04 +00:00
return e . ReconcileSnapshotData ( ctx )
2021-05-07 23:10:04 +00:00
}
}
}
}
logrus . Info ( "Removing the given locally stored etcd snapshot(s)" )
2021-07-26 16:59:33 +00:00
logrus . Debugf ( "Attempting to remove the given locally stored etcd snapshot(s): %v" , snapshots )
2021-05-07 23:10:04 +00:00
for _ , s := range snapshots {
// check if the given snapshot exists. If it does,
// remove it, otherwise continue.
sf := filepath . Join ( snapshotDir , s )
if _ , err := os . Stat ( sf ) ; os . IsNotExist ( err ) {
2021-07-26 16:59:33 +00:00
logrus . Infof ( "Snapshot %s, does not exist" , s )
2021-05-07 23:10:04 +00:00
continue
}
if err := os . Remove ( sf ) ; err != nil {
return err
}
2021-07-26 16:59:33 +00:00
logrus . Debug ( "Removed snapshot " , s )
2021-05-07 23:10:04 +00:00
}
2021-11-29 18:30:04 +00:00
return e . ReconcileSnapshotData ( ctx )
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
}
2021-11-29 18:30:04 +00:00
// AddSnapshotData adds the given snapshot file information to the snapshot configmap, using the existing extra metadata
// available at the time.
2022-01-14 17:31:22 +00:00
func ( e * ETCD ) addSnapshotData ( sf snapshotFile ) error {
2023-01-10 18:51:39 +00:00
return retry . OnError ( snapshotDataBackoff , func ( err error ) bool {
2021-05-01 01:26:39 +00:00
return apierrors . IsConflict ( err ) || apierrors . IsAlreadyExists ( err )
} , func ( ) error {
2021-11-29 18:30:04 +00:00
// make sure the core.Factory is initialized. There can
2021-05-01 01:26:39 +00:00
// be a race between this core code startup.
for e . config . Runtime . Core == nil {
runtime . Gosched ( )
}
snapshotConfigMap , getErr := e . config . Runtime . Core . Core ( ) . V1 ( ) . ConfigMap ( ) . Get ( metav1 . NamespaceSystem , snapshotConfigMapName , metav1 . GetOptions { } )
2022-06-14 22:12:28 +00:00
sfKey := generateSnapshotConfigMapKey ( sf )
2021-11-29 18:30:04 +00:00
marshalledSnapshotFile , err := json . Marshal ( sf )
2021-05-01 01:26:39 +00:00
if err != nil {
return err
}
if apierrors . IsNotFound ( getErr ) {
cm := v1 . ConfigMap {
ObjectMeta : metav1 . ObjectMeta {
Name : snapshotConfigMapName ,
Namespace : metav1 . NamespaceSystem ,
} ,
2022-06-14 22:12:28 +00:00
Data : map [ string ] string { sfKey : string ( marshalledSnapshotFile ) } ,
2021-05-01 01:26:39 +00:00
}
_ , err := e . config . Runtime . Core . Core ( ) . V1 ( ) . ConfigMap ( ) . Create ( & cm )
return err
}
if snapshotConfigMap . Data == nil {
2021-05-11 23:59:33 +00:00
snapshotConfigMap . Data = make ( map [ string ] string )
2021-05-01 01:26:39 +00:00
}
2021-11-29 18:30:04 +00:00
snapshotConfigMap . Data [ sfKey ] = string ( marshalledSnapshotFile )
_ , err = e . config . Runtime . Core . Core ( ) . V1 ( ) . ConfigMap ( ) . Update ( snapshotConfigMap )
return err
} )
}
2022-01-14 17:31:22 +00:00
func generateSnapshotConfigMapKey ( sf snapshotFile ) string {
2022-06-14 22:12:28 +00:00
name := invalidKeyChars . ReplaceAllString ( sf . Name , "_" )
2021-11-29 18:30:04 +00:00
if sf . NodeName == "s3" {
2022-06-14 22:12:28 +00:00
return "s3-" + name
2021-11-29 18:30:04 +00:00
}
2022-06-14 22:12:28 +00:00
return "local-" + name
2021-11-29 18:30:04 +00:00
}
// ReconcileSnapshotData reconciles snapshot data in the snapshot ConfigMap.
// It will reconcile snapshot data from disk locally always, and if S3 is enabled, will attempt to list S3 snapshots
// and reconcile snapshots from S3. Notably,
func ( e * ETCD ) ReconcileSnapshotData ( ctx context . Context ) error {
logrus . Infof ( "Reconciling etcd snapshot data in %s ConfigMap" , snapshotConfigMapName )
defer logrus . Infof ( "Reconciliation of snapshot data in %s ConfigMap complete" , snapshotConfigMapName )
return retry . OnError ( retry . DefaultBackoff , func ( err error ) bool {
return apierrors . IsConflict ( err ) || apierrors . IsAlreadyExists ( err )
} , func ( ) error {
// make sure the core.Factory is initialize. There can
// be a race between this core code startup.
for e . config . Runtime . Core == nil {
runtime . Gosched ( )
}
logrus . Debug ( "core.Factory is initialized" )
snapshotConfigMap , getErr := e . config . Runtime . Core . Core ( ) . V1 ( ) . ConfigMap ( ) . Get ( metav1 . NamespaceSystem , snapshotConfigMapName , metav1 . GetOptions { } )
if apierrors . IsNotFound ( getErr ) {
// Can't reconcile what doesn't exist.
return errors . New ( "No snapshot configmap found" )
}
logrus . Debugf ( "Attempting to reconcile etcd snapshot data for configmap generation %d" , snapshotConfigMap . Generation )
// if the snapshot config map data is nil, no need to reconcile.
if snapshotConfigMap . Data == nil {
return nil
}
snapshotFiles , err := e . listLocalSnapshots ( )
if err != nil {
return err
}
// s3ListSuccessful is set to true if we are successful at listing snapshots from S3 to eliminate accidental
// clobbering of S3 snapshots in the configmap due to misconfigured S3 credentials/details
s3ListSuccessful := false
if e . config . EtcdS3 {
if s3Snapshots , err := e . listS3Snapshots ( ctx ) ; err != nil {
logrus . Errorf ( "error retrieving S3 snapshots for reconciliation: %v" , err )
} else {
for k , v := range s3Snapshots {
snapshotFiles [ k ] = v
}
s3ListSuccessful = true
}
}
2021-05-04 17:03:28 +00:00
nodeName := os . Getenv ( "NODE_NAME" )
2021-11-29 18:30:04 +00:00
// deletedSnapshots is a map[string]string where key is the configmap key and the value is the marshalled snapshot file
// it will be populated below with snapshots that are either from S3 or on the local node. Notably, deletedSnapshots will
// not contain snapshots that are in the "failed" status
deletedSnapshots := make ( map [ string ] string )
// failedSnapshots is a slice of unmarshaled snapshot files sourced from the configmap
// These are stored unmarshaled so we can sort based on name.
2022-01-14 17:31:22 +00:00
var failedSnapshots [ ] snapshotFile
var failedS3Snapshots [ ] snapshotFile
2021-11-29 18:30:04 +00:00
// remove entries for this node and s3 (if S3 is enabled) only
2021-05-01 01:26:39 +00:00
for k , v := range snapshotConfigMap . Data {
2022-01-14 17:31:22 +00:00
var sf snapshotFile
2021-05-01 01:26:39 +00:00
if err := json . Unmarshal ( [ ] byte ( v ) , & sf ) ; err != nil {
return err
}
2022-01-14 17:31:22 +00:00
if ( sf . NodeName == nodeName || ( sf . NodeName == "s3" && s3ListSuccessful ) ) && sf . Status != failedSnapshotStatus {
2021-11-29 18:30:04 +00:00
// Only delete the snapshot if the snapshot was not failed
// sf.Status != FailedSnapshotStatus is intentional, as it is possible we are reconciling snapshots stored from older versions that did not set status
deletedSnapshots [ generateSnapshotConfigMapKey ( sf ) ] = v // store a copy of the snapshot
delete ( snapshotConfigMap . Data , k )
2022-01-14 17:31:22 +00:00
} else if sf . Status == failedSnapshotStatus && sf . NodeName == nodeName && e . config . EtcdSnapshotRetention >= 1 {
2021-11-29 18:30:04 +00:00
// Handle locally failed snapshots.
failedSnapshots = append ( failedSnapshots , sf )
delete ( snapshotConfigMap . Data , k )
2022-01-14 17:31:22 +00:00
} else if sf . Status == failedSnapshotStatus && e . config . EtcdS3 && sf . NodeName == "s3" && strings . HasPrefix ( sf . Name , e . config . EtcdSnapshotName + "-" + nodeName ) && e . config . EtcdSnapshotRetention >= 1 {
2021-11-29 18:30:04 +00:00
// If we're operating against S3, we can clean up failed S3 snapshots that failed on this node.
failedS3Snapshots = append ( failedS3Snapshots , sf )
2021-05-01 01:26:39 +00:00
delete ( snapshotConfigMap . Data , k )
}
}
2021-11-29 18:30:04 +00:00
// Apply the failed snapshot retention policy to locally failed snapshots
if len ( failedSnapshots ) > 0 && e . config . EtcdSnapshotRetention >= 1 {
sort . Slice ( failedSnapshots , func ( i , j int ) bool {
return failedSnapshots [ i ] . Name > failedSnapshots [ j ] . Name
} )
var keepCount int
if e . config . EtcdSnapshotRetention >= len ( failedSnapshots ) {
keepCount = len ( failedSnapshots )
} else {
keepCount = e . config . EtcdSnapshotRetention
}
for _ , dfs := range failedSnapshots [ : keepCount ] {
sfKey := generateSnapshotConfigMapKey ( dfs )
marshalledSnapshot , err := json . Marshal ( dfs )
if err != nil {
logrus . Errorf ( "unable to marshal snapshot to store in configmap %v" , err )
} else {
snapshotConfigMap . Data [ sfKey ] = string ( marshalledSnapshot )
}
}
2021-05-01 01:26:39 +00:00
}
2021-11-29 18:30:04 +00:00
// Apply the failed snapshot retention policy to the S3 snapshots
if len ( failedS3Snapshots ) > 0 && e . config . EtcdSnapshotRetention >= 1 {
sort . Slice ( failedS3Snapshots , func ( i , j int ) bool {
return failedS3Snapshots [ i ] . Name > failedS3Snapshots [ j ] . Name
} )
var keepCount int
if e . config . EtcdSnapshotRetention >= len ( failedS3Snapshots ) {
keepCount = len ( failedS3Snapshots )
} else {
keepCount = e . config . EtcdSnapshotRetention
}
for _ , dfs := range failedS3Snapshots [ : keepCount ] {
sfKey := generateSnapshotConfigMapKey ( dfs )
marshalledSnapshot , err := json . Marshal ( dfs )
if err != nil {
logrus . Errorf ( "unable to marshal snapshot to store in configmap %v" , err )
} else {
snapshotConfigMap . Data [ sfKey ] = string ( marshalledSnapshot )
}
}
}
// save the local entries to the ConfigMap if they are still on disk or in S3.
for _ , snapshot := range snapshotFiles {
2022-01-14 17:31:22 +00:00
var sf snapshotFile
2021-11-29 18:30:04 +00:00
sfKey := generateSnapshotConfigMapKey ( snapshot )
if v , ok := deletedSnapshots [ sfKey ] ; ok {
// use the snapshot file we have from the existing configmap, and unmarshal it so we can manipulate it
if err := json . Unmarshal ( [ ] byte ( v ) , & sf ) ; err != nil {
logrus . Errorf ( "error unmarshaling snapshot file: %v" , err )
// use the snapshot with info we sourced from disk/S3 (will be missing metadata, but something is better than nothing)
sf = snapshot
}
} else {
sf = snapshot
}
2022-01-14 17:31:22 +00:00
sf . Status = successfulSnapshotStatus // if the snapshot is on disk or in S3, it was successful.
2021-11-29 18:30:04 +00:00
marshalledSnapshot , err := json . Marshal ( sf )
if err != nil {
logrus . Warnf ( "unable to marshal snapshot metadata %s to store in configmap, received error: %v" , sf . Name , err )
} else {
snapshotConfigMap . Data [ sfKey ] = string ( marshalledSnapshot )
}
}
logrus . Debugf ( "Updating snapshot ConfigMap (%s) with %d entries" , snapshotConfigMapName , len ( snapshotConfigMap . Data ) )
2021-05-01 01:26:39 +00:00
_ , err = e . config . Runtime . Core . Core ( ) . V1 ( ) . ConfigMap ( ) . Update ( snapshotConfigMap )
return err
} )
}
// setSnapshotFunction schedules snapshots at the configured interval.
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
func ( e * ETCD ) setSnapshotFunction ( ctx context . Context ) {
2023-01-10 18:51:39 +00:00
skipJob := cron . SkipIfStillRunning ( cronLogger )
e . cron . AddJob ( e . config . EtcdSnapshotCron , skipJob ( cron . FuncJob ( func ( ) {
// Add a small amount of jitter to the actual snapshot execution. On clusters with multiple servers,
// having all the nodes take a snapshot at the exact same time can lead to excessive retry thrashing
// when updating the snapshot list configmap.
time . Sleep ( time . Duration ( rand . Float64 ( ) * float64 ( snapshotJitterMax ) ) )
2021-01-21 21:09:15 +00:00
if err := e . Snapshot ( ctx , e . config ) ; err != nil {
logrus . Error ( err )
}
2023-01-10 18:51:39 +00:00
} ) ) )
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
}
// Restore performs a restore of the ETCD datastore from
// the given snapshot path. This operation exists upon
// completion.
func ( e * ETCD ) Restore ( ctx context . Context ) error {
// check the old etcd data dir
2021-11-10 12:33:42 +00:00
oldDataDir := DBDir ( e . config ) + "-old-" + strconv . Itoa ( int ( time . Now ( ) . Unix ( ) ) )
2020-09-30 00:53:31 +00:00
if e . config . ClusterResetRestorePath == "" {
return errors . New ( "no etcd restore path was specified" )
}
// make sure snapshot exists before restoration
if _ , err := os . Stat ( e . config . ClusterResetRestorePath ) ; err != nil {
return err
}
2022-01-14 17:31:22 +00:00
var restorePath string
if strings . HasSuffix ( e . config . ClusterResetRestorePath , compressedExtension ) {
snapshotDir , err := snapshotDir ( e . config , true )
if err != nil {
return errors . Wrap ( err , "failed to get the snapshot dir" )
}
decompressSnapshot , err := e . decompressSnapshot ( snapshotDir , e . config . ClusterResetRestorePath )
if err != nil {
return err
}
restorePath = decompressSnapshot
} else {
restorePath = e . config . ClusterResetRestorePath
}
2020-09-30 00:53:31 +00:00
// move the data directory to a temp path
2021-11-10 12:33:42 +00:00
if err := os . Rename ( DBDir ( e . config ) , oldDataDir ) ; err != nil {
2020-09-30 00:53:31 +00:00
return err
}
2022-01-14 17:31:22 +00:00
2020-09-30 00:53:31 +00:00
logrus . Infof ( "Pre-restore etcd database moved to %s" , oldDataDir )
2022-01-14 17:31:22 +00:00
2022-04-13 23:22:07 +00:00
lg , err := logutil . CreateDefaultZapLogger ( zap . InfoLevel )
if err != nil {
return err
}
return snapshot . NewV3 ( lg ) . Restore ( snapshot . RestoreConfig {
2022-01-14 17:31:22 +00:00
SnapshotPath : restorePath ,
2020-09-30 00:53:31 +00:00
Name : e . name ,
2021-11-10 12:33:42 +00:00
OutputDataDir : DBDir ( e . config ) ,
2020-09-30 00:53:31 +00:00
OutputWALDir : walDir ( e . config ) ,
PeerURLs : [ ] string { e . peerURL ( ) } ,
InitialCluster : e . name + "=" + e . peerURL ( ) ,
2021-07-03 11:24:58 +00:00
} )
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
}
// snapshotRetention iterates through the snapshots and removes the oldest
// leaving the desired number of snapshots.
2021-07-19 21:30:57 +00:00
func snapshotRetention ( retention int , snapshotPrefix string , snapshotDir string ) error {
2021-11-29 18:30:04 +00:00
if retention < 1 {
return nil
}
2021-05-01 01:26:39 +00:00
nodeName := os . Getenv ( "NODE_NAME" )
2021-11-29 18:30:04 +00:00
logrus . Infof ( "Applying local snapshot retention policy: retention: %d, snapshotPrefix: %s, directory: %s" , retention , snapshotPrefix + "-" + nodeName , snapshotDir )
2021-05-01 01:26:39 +00:00
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
var snapshotFiles [ ] os . FileInfo
if err := filepath . Walk ( snapshotDir , func ( path string , info os . FileInfo , err error ) error {
if err != nil {
return err
}
2021-07-19 21:30:57 +00:00
if strings . HasPrefix ( info . Name ( ) , snapshotPrefix + "-" + nodeName ) {
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
snapshotFiles = append ( snapshotFiles , info )
}
return nil
} ) ; err != nil {
return err
}
if len ( snapshotFiles ) <= retention {
return nil
}
sort . Slice ( snapshotFiles , func ( i , j int ) bool {
return snapshotFiles [ i ] . Name ( ) < snapshotFiles [ j ] . Name ( )
} )
2021-03-03 18:14:12 +00:00
delCount := len ( snapshotFiles ) - retention
for _ , df := range snapshotFiles [ : delCount ] {
2021-11-29 18:30:04 +00:00
snapshotPath := filepath . Join ( snapshotDir , df . Name ( ) )
logrus . Infof ( "Removing local snapshot %s" , snapshotPath )
if err := os . Remove ( snapshotPath ) ; err != nil {
2021-03-03 18:14:12 +00:00
return err
}
}
return nil
Galal hussein etcd backup restore (#2154)
* Add etcd snapshot and restore
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix error logs
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* fix flag describtion
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* Add disable snapshot and retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* use creation time for snapshot retention
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
* unexport method, update var name
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* adjust snapshot flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var name, string concat
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* revert previous change, create constants
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* type assertion error checking
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* pr remediation
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* updates
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* simplify logic, remove unneeded function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update flags
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add comment
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* exit on restore completion, update flag names, move retention check
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update disable snapshots flag and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* move function
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update var and field names
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update defaultSnapshotIntervalMinutes to 12 like rke
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update directory perms
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update etc-snapshot-dir usage
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update interval to 12 hours
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* fix usage typo
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* add cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* wire in cron
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update deps target to work, add build/data target for creation, and generate
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove dead make targets
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* error handling, cluster reset functionality
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* update
Signed-off-by: Brian Downs <brian.downs@gmail.com>
* remove intermediate dapper file
Signed-off-by: Brian Downs <brian.downs@gmail.com>
Co-authored-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
2020-08-28 23:57:40 +00:00
}
2020-12-07 20:30:44 +00:00
// backupDirWithRetention moves dir to "<dir>-backup-<unix seconds>" and
// returns the new path.
//
// Before the move, existing backups of dir are pruned: sibling directories
// whose name starts with "<base of dir>-backup" are ordered newest first by
// modification time and everything beyond the first maxBackupRetention is
// removed recursively. The backup created by this call is made after the
// pruning and is therefore not counted against maxBackupRetention.
//
// If dir cannot be stat'ed there is nothing to back up and ("", nil) is
// returned. Errors from listing the parent directory, stat'ing a backup,
// removing an old backup or renaming dir are returned with an empty path.
func backupDirWithRetention(dir string, maxBackupRetention int) (string, error) {
	backupDir := dir + "-backup-" + strconv.Itoa(int(time.Now().Unix()))

	// Any stat failure is deliberately treated as "nothing to back up".
	if _, err := os.Stat(dir); err != nil {
		return "", nil
	}

	parent := filepath.Dir(dir)
	entries, err := os.ReadDir(parent)
	if err != nil {
		return "", err
	}

	// Select the backup directories before stat'ing anything, so a failure
	// to stat an unrelated sibling entry cannot abort the backup.
	backupPrefix := filepath.Base(dir) + "-backup"
	backups := make([]fs.FileInfo, 0, len(entries))
	for _, entry := range entries {
		if !entry.IsDir() || !strings.HasPrefix(entry.Name(), backupPrefix) {
			continue
		}
		info, err := entry.Info()
		if err != nil {
			return "", err
		}
		backups = append(backups, info)
	}

	// Newest first, so the entries past the retention limit are the oldest.
	sort.Slice(backups, func(i, j int) bool {
		return backups[i].ModTime().After(backups[j].ModTime())
	})

	for i, backup := range backups {
		if i < maxBackupRetention {
			continue
		}
		if err := os.RemoveAll(filepath.Join(parent, backup.Name())); err != nil {
			return "", err
		}
	}

	// Move the directory to its backup path.
	if err := os.Rename(dir, backupDir); err != nil {
		return "", err
	}
	return backupDir, nil
}
2021-02-12 15:35:57 +00:00
2022-02-16 22:19:58 +00:00
// GetAPIServerURLsFromETCD will try to fetch the version.Program/apiaddresses key from etcd
2022-02-24 22:35:08 +00:00
// and unmarshal it to a list of apiserver endpoints.
2022-02-16 22:19:58 +00:00
func GetAPIServerURLsFromETCD ( ctx context . Context , cfg * config . Control ) ( [ ] string , error ) {
2022-04-12 16:59:47 +00:00
cl , err := GetClient ( ctx , cfg )
2021-02-12 15:35:57 +00:00
if err != nil {
2022-02-16 22:19:58 +00:00
return nil , err
2021-02-12 15:35:57 +00:00
}
2022-03-10 22:03:02 +00:00
defer cl . Close ( )
2021-02-12 15:35:57 +00:00
etcdResp , err := cl . KV . Get ( ctx , AddressKey )
if err != nil {
2022-02-16 22:19:58 +00:00
return nil , err
2021-02-12 15:35:57 +00:00
}
2022-02-16 22:19:58 +00:00
if etcdResp . Count == 0 || len ( etcdResp . Kvs [ 0 ] . Value ) == 0 {
return nil , ErrAddressNotSet
2021-02-12 15:35:57 +00:00
}
2022-02-16 22:19:58 +00:00
2021-02-12 15:35:57 +00:00
var addresses [ ] string
if err := json . Unmarshal ( etcdResp . Kvs [ 0 ] . Value , & addresses ) ; err != nil {
2022-02-16 22:19:58 +00:00
return nil , fmt . Errorf ( "failed to unmarshal apiserver addresses from etcd: %v" , err )
2021-02-12 15:35:57 +00:00
}
2022-02-16 22:19:58 +00:00
return addresses , nil
2021-02-12 15:35:57 +00:00
}
// GetMembersClientURLs will list through the member lists in etcd and return
// back a combined list of client urls for each member in the cluster
func ( e * ETCD ) GetMembersClientURLs ( ctx context . Context ) ( [ ] string , error ) {
2023-02-13 20:00:52 +00:00
return e . client . Endpoints ( ) , nil
2021-02-12 15:35:57 +00:00
}
2021-03-01 21:50:50 +00:00
2021-09-14 15:20:38 +00:00
// GetMembersNames asks etcd for its member list and returns the name of each
// member, in the order the members were reported. The request is bounded by
// testTimeout. A nil slice is returned when there are no members, and a nil
// slice plus the error when the member list request fails.
func (e *ETCD) GetMembersNames(ctx context.Context) ([]string, error) {
	listCtx, cancel := context.WithTimeout(ctx, testTimeout)
	defer cancel()

	resp, err := e.client.MemberList(listCtx)
	if err != nil {
		return nil, err
	}

	var names []string
	for i := range resp.Members {
		names = append(names, resp.Members[i].Name)
	}
	return names, nil
}
2021-03-01 21:50:50 +00:00
// RemoveSelf will remove the member if it exists in the cluster
func ( e * ETCD ) RemoveSelf ( ctx context . Context ) error {
2021-09-14 15:20:38 +00:00
if err := e . RemovePeer ( ctx , e . name , e . address , true ) ; err != nil {
2021-03-16 16:14:43 +00:00
return err
}
// backup the data dir to avoid issues when re-enabling etcd
2021-11-10 12:33:42 +00:00
oldDataDir := DBDir ( e . config ) + "-old-" + strconv . Itoa ( int ( time . Now ( ) . Unix ( ) ) )
2021-03-16 16:14:43 +00:00
// move the data directory to a temp path
2021-11-10 12:33:42 +00:00
return os . Rename ( DBDir ( e . config ) , oldDataDir )
2021-03-01 21:50:50 +00:00
}