You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
consul/internal/storage/raft/backend.go

351 lines
11 KiB

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package raft
import (
"context"
"fmt"
"net"
"strconv"
"golang.org/x/sync/errgroup"
"google.golang.org/grpc"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/consul/internal/storage"
"github.com/hashicorp/consul/internal/storage/inmem"
"github.com/hashicorp/consul/proto-public/pbresource"
pbstorage "github.com/hashicorp/consul/proto/private/pbstorage"
)
// NewBackend returns a storage backend that uses Raft for durable persistence
// and serves reads from an in-memory database. It's suitable for production use.
//
// It's not an entirely clean abstraction because rather than owning the Raft
// subsystem directly, it has to integrate with the existing FSM and related
// machinery from before generic resources.
//
// The given Handle will be used to apply logs and interrogate leadership state.
// In certain restricted circumstances, Handle may be nil, such as during tests
// that only exercise snapshot restoration, or when initializing a throwaway FSM
// during peers.json recovery - but calling any of the data access methods (read
// or write) will result in a panic.
//
// With Raft, writes and strongly consistent reads must be done on the leader.
// Backend implements a gRPC server, which followers will use to transparently
// forward operations to the leader. To do so, they will obtain a connection
// using Handle.DialLeader. Connections are cached for re-use, so when there's
// a new leader, you must call LeaderChanged to refresh the connection. Leaders
// must accept connections and hand them off by calling Backend.HandleConnection.
// Backend's gRPC client and server *DO NOT* handle TLS themselves, as they are
// intended to communicate over Consul's multiplexed server port (which handles
// TLS).
//
// For more information, see here:
// https://github.com/hashicorp/consul/tree/main/docs/resources#raft-storage-backend
//
// You must call Run before using the backend.
func NewBackend(h Handle, l hclog.Logger) (*Backend, error) {
s, err := inmem.NewStore()
if err != nil {
return nil, err
}
b := &Backend{handle: h, store: s}
b.forwardingServer = newForwardingServer(b)
b.forwardingClient = newForwardingClient(h, l)
return b, nil
}
// Handle provides glue for interacting with the Raft subsystem via existing
// machinery on consul.Server.
type Handle interface {
// Apply the given log message.
Apply(msg []byte) (any, error)
// IsLeader determines if this server is the Raft leader (so can handle writes).
IsLeader() bool
// EnsureStrongConsistency checks the server is able to handle consistent reads by
// verifying its leadership and checking the FSM has applied all queued writes.
EnsureStrongConsistency(ctx context.Context) error
// DialLeader dials a gRPC connection to the leader for forwarding.
DialLeader() (*grpc.ClientConn, error)
}
// Backend is a Raft-backed storage backend implementation.
type Backend struct {
handle Handle
store *inmem.Store
forwardingServer *forwardingServer
forwardingClient *forwardingClient
}
// Run until the given context is canceled. This method blocks, so should be
// called in a goroutine.
func (b *Backend) Run(ctx context.Context) {
group, groupCtx := errgroup.WithContext(ctx)
group.Go(func() error {
b.store.Run(groupCtx)
return nil
})
group.Go(func() error {
return b.forwardingServer.run(groupCtx)
})
group.Wait()
}
// Read implements the storage.Backend interface.
func (b *Backend) Read(ctx context.Context, consistency storage.ReadConsistency, id *pbresource.ID) (*pbresource.Resource, error) {
// Easy case. Both leaders and followers can read from the local store.
if consistency == storage.EventualConsistency {
return b.store.Read(id)
}
if consistency != storage.StrongConsistency {
return nil, fmt.Errorf("%w: unknown consistency: %s", storage.ErrInconsistent, consistency)
}
// We are the leader. Handle the request ourself.
if b.handle.IsLeader() {
return b.leaderRead(ctx, id)
}
// Forward the request to the leader.
rsp, err := b.forwardingClient.read(ctx, &pbstorage.ReadRequest{Id: id})
if err != nil {
return nil, err
}
return rsp.GetResource(), nil
}
func (b *Backend) leaderRead(ctx context.Context, id *pbresource.ID) (*pbresource.Resource, error) {
if err := b.ensureStrongConsistency(ctx); err != nil {
return nil, err
}
return b.store.Read(id)
}
// WriteCAS implements the storage.Backend interface.
func (b *Backend) WriteCAS(ctx context.Context, res *pbresource.Resource) (*pbresource.Resource, error) {
req := &pbstorage.WriteRequest{Resource: res}
if b.handle.IsLeader() {
rsp, err := b.raftApply(&pbstorage.Log{
Type: pbstorage.LogType_LOG_TYPE_WRITE,
Request: &pbstorage.Log_Write{
Write: req,
},
})
if err != nil {
return nil, err
}
return rsp.GetWrite().GetResource(), nil
}
rsp, err := b.forwardingClient.write(ctx, req)
if err != nil {
return nil, err
}
return rsp.GetResource(), nil
}
// DeleteCAS implements the storage.Backend interface.
func (b *Backend) DeleteCAS(ctx context.Context, id *pbresource.ID, version string) error {
req := &pbstorage.DeleteRequest{
Id: id,
Version: version,
}
if b.handle.IsLeader() {
_, err := b.raftApply(&pbstorage.Log{
Type: pbstorage.LogType_LOG_TYPE_DELETE,
Request: &pbstorage.Log_Delete{
Delete: req,
},
})
return err
}
return b.forwardingClient.delete(ctx, req)
}
// List implements the storage.Backend interface.
func (b *Backend) List(ctx context.Context, consistency storage.ReadConsistency, resType storage.UnversionedType, tenancy *pbresource.Tenancy, namePrefix string) ([]*pbresource.Resource, error) {
// Easy case. Both leaders and followers can read from the local store.
if consistency == storage.EventualConsistency {
return b.store.List(resType, tenancy, namePrefix)
}
if consistency != storage.StrongConsistency {
return nil, fmt.Errorf("%w: unknown consistency: %s", storage.ErrInconsistent, consistency)
}
// We are the leader. Handle the request ourself.
if b.handle.IsLeader() {
return b.leaderList(ctx, resType, tenancy, namePrefix)
}
// Forward the request to the leader.
rsp, err := b.forwardingClient.list(ctx, &pbstorage.ListRequest{
Type: &pbresource.Type{
Group: resType.Group,
Kind: resType.Kind,
},
Tenancy: tenancy,
NamePrefix: namePrefix,
})
if err != nil {
return nil, err
}
return rsp.GetResources(), nil
}
func (b *Backend) leaderList(ctx context.Context, resType storage.UnversionedType, tenancy *pbresource.Tenancy, namePrefix string) ([]*pbresource.Resource, error) {
if err := b.ensureStrongConsistency(ctx); err != nil {
return nil, err
}
return b.store.List(resType, tenancy, namePrefix)
}
// WatchList implements the storage.Backend interface.
func (b *Backend) WatchList(_ context.Context, resType storage.UnversionedType, tenancy *pbresource.Tenancy, namePrefix string) (storage.Watch, error) {
return b.store.WatchList(resType, tenancy, namePrefix)
}
// ListByOwner implements the storage.Backend interface.
func (b *Backend) ListByOwner(_ context.Context, id *pbresource.ID) ([]*pbresource.Resource, error) {
return b.store.ListByOwner(id)
}
// Apply is called by the FSM with the bytes of a Raft log entry, with Consul's
// envelope (i.e. type prefix and msgpack wrapper) stripped off.
func (b *Backend) Apply(buf []byte, idx uint64) any {
var req pbstorage.Log
if err := req.UnmarshalBinary(buf); err != nil {
return fmt.Errorf("failed to decode request: %w", err)
}
switch req.Type {
case pbstorage.LogType_LOG_TYPE_WRITE:
res := req.GetWrite().GetResource()
oldVsn := res.Version
res.Version = strconv.Itoa(int(idx))
if err := b.store.WriteCAS(res, oldVsn); err != nil {
return err
}
return &pbstorage.LogResponse{
Response: &pbstorage.LogResponse_Write{
Write: &pbstorage.WriteResponse{Resource: res},
},
}
case pbstorage.LogType_LOG_TYPE_DELETE:
req := req.GetDelete()
if err := b.store.DeleteCAS(req.Id, req.Version); err != nil {
return err
}
return &pbstorage.LogResponse{
Response: &pbstorage.LogResponse_Delete{},
}
}
return fmt.Errorf("unexpected request type: %s", req.Type)
}
// LeaderChanged should be called whenever the current Raft leader changes, to
// drop and re-create the gRPC connection used for forwarding.
func (b *Backend) LeaderChanged() { b.forwardingClient.leaderChanged() }
// HandleConnection should be called whenever a forwarding connection is opened.
func (b *Backend) HandleConnection(conn net.Conn) { b.forwardingServer.listener.Handle(conn) }
// raftApply round trips the given request through the Raft log and FSM.
func (b *Backend) raftApply(req *pbstorage.Log) (*pbstorage.LogResponse, error) {
msg, err := req.MarshalBinary()
if err != nil {
return nil, err
}
rsp, err := b.handle.Apply(msg)
if err != nil {
return nil, err
}
switch t := rsp.(type) {
case *pbstorage.LogResponse:
return t, nil
default:
return nil, fmt.Errorf("unexpected response from Raft apply: %T", rsp)
}
}
func (b *Backend) ensureStrongConsistency(ctx context.Context) error {
if err := b.handle.EnsureStrongConsistency(ctx); err != nil {
return fmt.Errorf("%w: %v", storage.ErrInconsistent, err)
}
return nil
}
// Snapshot obtains a point-in-time snapshot of the backend's state, so that it
// can be written to disk as a backup or sent to bootstrap a follower.
func (b *Backend) Snapshot() (*Snapshot, error) {
s, err := b.store.Snapshot()
if err != nil {
return nil, err
}
return &Snapshot{s}, nil
}
// Snapshot is a point-in-time snapshot of a backend's state.
type Snapshot struct{ s *inmem.Snapshot }
// Next returns the next resource in the snapshot, protobuf encoded. nil bytes
// will be returned when the end of the snapshot has been reached.
func (s *Snapshot) Next() ([]byte, error) {
res := s.s.Next()
if res == nil {
return nil, nil
}
return res.MarshalBinary()
}
// Restore starts the process of restoring a snapshot (i.e. from an on-disk
// backup, or to bootstrap from a leader).
//
// Callers *must* call Abort or Commit when done, to free resources.
func (b *Backend) Restore() (*Restoration, error) {
r, err := b.store.Restore()
if err != nil {
return nil, err
}
return &Restoration{r}, nil
}
// Restoration is a handle that can be used to restore a snapshot.
type Restoration struct{ r *inmem.Restoration }
// Apply the given protobuf-encoded resource to the backend.
func (r *Restoration) Apply(msg []byte) error {
var res pbresource.Resource
if err := res.UnmarshalBinary(msg); err != nil {
return err
}
return r.r.Apply(&res)
}
// Commit the restoration.
func (r *Restoration) Commit() { r.r.Commit() }
// Abort the restoration. It's safe to always call this in a defer statement
// because aborting a committed restoration is a no-op.
func (r *Restoration) Abort() { r.r.Abort() }